source: main/trunk/greenstone2/perllib/plugins/EmailPlugin.pm@ 31742

Last change on this file since 31742 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.1 KB
RevLine 
[638]1###########################################################################
2#
[15872]3# EmailPlugin.pm - a plugin for parsing email files
[638]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
[3630]9# Copyright (C) 1999-2002 New Zealand Digital Library Project
[638]10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
[1206]28
[15872]29# EmailPlugin
[638]30#
[1206]31# by Gordon Paynter ([email protected])
[638]32#
[1206]33# Email plug reads email files. These are named with a simple
[2096]34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
[638]36#
37# Document text:
[1206]38# The document text consists of all the text
39# after the first blank line in the document.
[638]40#
[2730]41# Metadata (not Dublin Core!):
[6916]42# $Headers All the header content (optional, not stored by default)
[638]43# $Subject Subject: header
44# $To To: header
[2730]45# $From From: header
[2630]46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
[638]48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
[2630]50#
[2918]51# $Title made up of Subject, Date and Sender (for default formatting)
[19281]52# $InReplyTo Message id of the one this replies to
[2630]53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
[2730]61# * RFC 2047 - MIME (part 3) Message Header Extensions
[2630]62# * RFC 1806 - Content Dispositions (ie inline/attachment)
[3540]63
[16341]64
[15872]65package EmailPlugin;
[3540]66
[7830]67use strict;
68no strict "refs"; # so we can use a variable as a filehandle for print $out
69
[638]70
[15872]71use SplitTextFile;
[7830]72use unicode; # gs conv functions
73use gsprintf 'gsprintf'; # translations
[2730]74
[638]75use sorttools;
[28563]76use FileUtils;
[638]77
[10254]78sub BEGIN {
[15872]79 @EmailPlugin::ISA = ('SplitTextFile');
[10254]80}
[638]81
[17026]82my $extended_oidtype_list =
83 [ {'name' => "message_id",
84 'desc' => "{EmailPlugin.OIDtype.message_id}" }
85 ];
[638]86
[31492]87# add in all the standard options from BaseImporter
88unshift (@$extended_oidtype_list, @{$BaseImporter::oidtype_list});
[17026]89
[3540]90my $arguments =
[4744]91 [ { 'name' => "process_exp",
[31492]92 'desc' => "{BaseImporter.process_exp}",
[6408]93 'type' => "regexp",
[3540]94 'reqd' => "no",
[4744]95 'deft' => &get_default_process_exp() },
96 { 'name' => "no_attachments",
[15872]97 'desc' => "{EmailPlugin.no_attachments}",
[3630]98 'type' => "flag",
[4744]99 'reqd' => "no" },
[6916]100 { 'name' => "headers",
[15872]101 'desc' => "{EmailPlugin.headers}",
[6916]102 'type' => "flag",
103 'reqd' => "no" },
[17026]104 { 'name' => "OIDtype",
105 'desc' => "{import.OIDtype}",
106 'type' => "enum",
107 'list' => $extended_oidtype_list,
[19281]108 'deft' => "message_id",
[18591]109 'reqd' => "no" },
[17026]110 { 'name' => "OIDmetadata",
111 'desc' => "{import.OIDmetadata}",
112 'type' => "metadata",
113 'deft' => "dc.Identifier",
[18591]114 'reqd' => "no" },
[4744]115 { 'name' => "split_exp",
[15872]116 'desc' => "{EmailPlugin.split_exp}",
[6408]117 'type' => "regexp",
118 'reqd' => "no",
119 'deft' => &get_default_split_exp() }
120 ];
[3540]121
[15872]122my $options = { 'name' => "EmailPlugin",
123 'desc' => "{EmailPlugin.desc}",
[6408]124 'abstract' => "no",
125 'inherits' => "yes",
[4744]126 'args' => $arguments };
[3540]127
[638]128sub new {
[10218]129 my ($class) = shift (@_);
130 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
131 push(@$pluginlist, $class);
[3540]132
[15872]133 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
134 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]135
[15872]136 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]137
[3630]138 $self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
[16341]139 $self->{'tmp_file_paths'} = (); # list of tmp files to delete after processing is finished
[3630]140
[2730]141 # this might not actually be true at read-time, but after processing
142 # it should all be utf8.
143 $self->{'input_encoding'}="utf8";
[638]144 return bless $self, $class;
145}
146
[1244]147sub get_default_process_exp {
148 my $self = shift (@_);
[2096]149 # mbx/email for mailbox file format, \d+ for maildir (each message is
150 # in a separate file, with a unique number for filename)
[3111]151 # mozilla and IE will save individual mbx format files with a ".eml" ext.
[25741]152 return q@([\\/]\d+|\.(mbo?x|email|eml))$@;
[638]153}
154
[1895]155# This plugin splits the mbox mail files at lines starting with From<sp>
[3111]156# It is supposed to be "\n\nFrom ", but this isn't always used.
[9971]157# add \d{4} so that the line ends in a year (in case the text has an
158# unescaped "From " at the start of a line).
[1895]159sub get_default_split_exp {
[9971]160 return q^\nFrom .*\d{4}\n^;
[3523]161
[1895]162}
163
[16677]164sub can_process_this_file {
165 my $self = shift(@_);
166 my ($filename) = @_;
[1895]167
[16677]168 # avoid any confusion between filenames matching \d+ (which are by default
169 # matched by EmailPlugin) and directories that match \d+ (which should not)
170
171 return 0 if (-d $filename);
172
173 if ($self->{'process_exp'} ne "" && $filename =~ /$self->{'process_exp'}/) {
174 return 1;
175 }
176 return 0;
177
178}
179
180
[1244]181# do plugin specific processing of doc_obj
182sub process {
[2630]183
[638]184 my $self = shift (@_);
[6332]185 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1424]186 my $outhandle = $self->{'outhandle'};
187
[1244]188 # Check that we're dealing with a valid mail file
[3111]189 # mbox message files start with "From "
190 # maildir messages usually start with Return-Path and Delivered-To
191 # mh is very similar to maildir
192 my $startoffile=substr($$textref,0,256);
193 if (($startoffile !~ /^(From )/) &&
194 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
195 return undef;
196 }
[25741]197
[1244]198 my $cursection = $doc_obj->get_top_section();
[638]199
[1206]200 #
201 # Parse the document's text and extract metadata
202 #
[638]203
[2652]204 # Protect backslashes
205 $$textref =~ s@\\@\\\\@g;
206
[1206]207 # Separate header from body of message
[1244]208 my $Headers = $$textref;
[2630]209 $Headers =~ s/\r?\n\r?\n(.*)$//s;
210 $$textref = $1;
[4089]211 $Headers .= "\n";
[2779]212
[2630]213 # Unfold headers - see rfc822
214 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
[1206]215 # Extract basic metadata from header
216 my @headers = ("From", "To", "Subject", "Date");
217 my %raw;
[1658]218 foreach my $name (@headers) {
219 $raw{$name} = "No $name value";
220 }
[1206]221
[3132]222 # Get a default encoding for the header - RFC says should be ascii...
[6062]223 my $default_header_encoding="iso_8859_1";
[3132]224
225 # We don't know what character set is the user's default...
226 # We could use textcat to guess... for now we'll look at mime content-type
227# if ($Headers =~ /([[:^ascii:]])/) {
228# }
229 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
230 $default_header_encoding=$1;
231 $default_header_encoding =~ s@\-@_@g;
232 $default_header_encoding =~ tr/A-Z/a-z/;
233 }
234
235
[1658]236 # Examine each line of the headers
237 my ($line, $name, $value);
238 my @parts;
239 foreach $line (split(/\n/, $Headers)) {
240
241 # Ignore lines with no content or which begin with whitespace
242 next unless ($line =~ /:/);
243 next if ($line =~ /^\s/);
244
245 # Find out what metadata is on this line
246 @parts = split(/:/, $line);
247 $name = shift @parts;
[3136]248 # get fieldname in canonical form - first cap, then lower case.
[3134]249 $name =~ tr/A-Z/a-z/;
[3136]250 # uppercase the first character according to the current locale
[2630]251 $name=~s/(.+)/\u$1/;
[1658]252 next unless $name;
253 next unless ($raw{$name});
254
255 # Find the value of that metadata
256 $value = join(":", @parts);
257 $value =~ s/^\s+//;
258 $value =~ s/\s+$//;
[16341]259 # decode header values, using either =?<charset>?[BQ]?<data>?= (rfc2047) or default_header_encoding
260 $self->decode_header_value($default_header_encoding, \$value);
[2730]261
[1658]262 # Store the metadata
[6062]263 $value =~ s@_@\\_@g; # protect against GS macro language
[1206]264 $raw{$name} = $value;
265 }
266
[2630]267 # Extract the name and e-mail address from the From metadata
[6062]268 my $frommeta = $raw{"From"};
[2680]269 my $fromnamemeta;
270 my $fromaddrmeta;
271
272 $frommeta =~ s/\s*$//; # Remove trailing space, if any
273
274 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
275 $fromnamemeta=$1;
276 $fromaddrmeta=$2;
277 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
278 $fromnamemeta=$2;
279 $fromaddrmeta=$1;
[19283]280 } elsif ($frommeta =~ m/(.+)\s+at\s+(.+)\s+\((.*)\)/) {
281 $fromnamemeta=$3;
282 $fromaddrmeta="$1&#64;$2";
283 }
284
[2630]285 if (!defined($fromaddrmeta)) {
286 $fromaddrmeta=$frommeta;
287 }
[2680]288 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
[2717]289 # minor attempt to prevent spam-bots from harvesting addresses...
290 $fromaddrmeta=~s/@/&#64;/;
[6062]291
[2630]292 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
293
[3328]294 if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
[3215]295 $fromnamemeta =~ s/\"//g; # remove quotes
296 $fromnamemeta =~ s/\s+$//; # remove trailing whitespace
[2630]297 }
298 else {
299 $fromnamemeta = $fromaddrmeta;
300 }
301 # if name is an address
302 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
[2717]303 $fromnamemeta=~s/@/&#64\;/;
[2630]304 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
305
306 $raw{"From"}=$frommeta;
307
[1206]308 # Process Date information
[1658]309 if ($raw{"Date"} !~ /No Date/) {
[1206]310 $raw{"DateText"} = $raw{"Date"};
[638]311
[1206]312 # Convert the date text to internal date format
313 $value = $raw{"Date"};
[15212]314 # proper mbox format: Tue, 07 Jan 2003 17:27:42 +1300
[1206]315 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
[15212]316 if (!defined($day) || !defined($month) || !defined ($year)) {
317 # try monthly archive format: Wed Apr 23 00:26:08 2008
318 ($month,$day, $year) = $value =~ /([A-Z][a-z][a-z])\s\s?(\d?\d)\s\d\d:\d\d:\d\d\s(\d\d\d\d)/;
319 }
320
[3143]321 # make some assumptions about the year formatting...
322 # some (old) software thinks 2001 is 101, some think 2001 is 01
323 if ($year < 20) { $year += 2000; } # assume not really 1920...
324 elsif ($year < 150) { $year += 1900; } # assume not really 2150...
[1206]325 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
[638]326
[1206]327 } else {
328 # We have not extracted a date
329 $raw{"DateText"} = "Unknown.";
330 $raw{"Date"} = "19000000";
[638]331 }
332
[1244]333 # Add extracted metadata to document object
[1206]334 foreach my $name (keys %raw) {
335 $value = $raw{$name};
336 if ($value) {
[2730]337 # assume subject, etc headers have no special HTML meaning.
[1206]338 $value = &text_into_html($value);
[2730]339 # escape [] so it isn't re-interpreted as metadata
340 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
[1206]341 } else {
342 $value = "No $name field";
343 }
[1244]344 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
[1206]345 }
[638]346
[7703]347
348 # extract a message ID from the headers, if there is one, and we'll use
349 # that as the greenstone doc ID. Having a predictable ID means we can
350 # link to other messages, eg from In-Reply-To or References headers...
351 if ($Headers =~ m@^Message-ID:(.+)$@mi) {
352 my $id=escape_msg_id($1);
353 $doc_obj->{'msgid'}=$id;
354 }
355 # link to another message, if this is a reply
356 if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
357 my $id=escape_msg_id($1);
358 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
359 } elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
360 # References can have multiple, get the last one
361 my $id=escape_msg_id($1);
362 # not necessarily in-reply-to, but same thread...
363 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
364 }
365
366
367
[2630]368 my $mimetype="text/plain";
369 my $mimeinfo="";
[3132]370 my $charset = $default_header_encoding;
[3073]371 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
372 # more than one parameter given to Content-type.
373 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
[3630]374 if ($Headers =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;\s*.+)?\s*$@mi)
[2630]375 {
376 $mimetype=$1;
377 $mimetype =~ tr/[A-Z]/[a-z]/;
[3073]378
379 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
380 $mimetype = "text/plain";
381 }
382
[2630]383 $mimeinfo=$2;
[3073]384 if (!defined $mimeinfo) {
385 $mimeinfo="";
386 } else { # strip leading and trailing stuff
387 $mimeinfo =~ s/^\;\s*//;
388 $mimeinfo =~ s/\s*$//;
389 }
390 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
[2847]391 $charset = $1;
392 }
[2630]393 }
394
[2680]395 my $transfer_encoding="7bit";
396 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
397 $transfer_encoding=$1;
398 }
[6062]399
[2886]400 if ($mimetype eq "text/html") {
[9971]401 $$textref= $self->text_from_part($$textref, $Headers);
[2886]402 } elsif ($mimetype ne "text/plain") {
[3630]403 $self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
[16341]404 $$textref=$self->text_from_mime_message($mimetype,$mimeinfo,$default_header_encoding,$$textref);
[6062]405 } else { # mimetype eq text/plain
406
407 if ($transfer_encoding =~ /quoted\-printable/) {
408 $$textref=qp_decode($$textref);
409 } elsif ($transfer_encoding =~ /base64/) {
410 $$textref=base64_decode($$textref);
411 }
[2886]412 $self->convert2unicode($charset, $textref);
[8904]413
414 $$textref = &text_into_html($$textref);
415 $$textref =~ s@_@\\_@g; # protect against GS macro language
416
[2680]417 }
[6062]418
[2630]419
[10218]420 if ($self->{'headers'} && $self->{'headers'} == 1) {
[6916]421 # Add "All headers" metadata
422 $Headers = &text_into_html($Headers);
[2630]423
[6916]424 $Headers = "No headers" unless ($Headers =~ /\w/);
425 $Headers =~ s/@/&#64\;/g;
426 # escape [] so it isn't re-interpreted as metadata
427 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
428 $self->convert2unicode($charset, \$Headers);
[2754]429
[6916]430 $Headers =~ s@_@\\_@g; # protect against GS macro language
431 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
432 }
[6062]433
434
[2918]435 # Add Title metadata
436 my $Title = text_into_html($raw{'Subject'});
[7703]437 $Title .= "<br>From: " . text_into_html($fromnamemeta);
[2918]438 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
[3073]439 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
[2918]440
441 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
442
[8121]443 # Add FileFormat metadata
444 $doc_obj->add_metadata($cursection, "FileFormat", "EMAIL");
[2918]445
[1244]446 # Add text to document object
447 $$textref = "No message" unless ($$textref =~ /\w/);
[6062]448
[1244]449 $doc_obj->add_utf8_text($cursection, $$textref);
[638]450
[1244]451 return 1;
[638]452}
453
[16341]454# delete any temp files that we have created
455sub clean_up_after_doc_obj_processing {
456 my $self = shift(@_);
457
458 foreach my $tmp_file_path (@{$self->{'tmp_file_paths'}}) {
459 if (-e $tmp_file_path) {
[28563]460 &FileUtils::removeFiles($tmp_file_path);
[16341]461 }
462 }
463
464}
[638]465
466# Convert a text string into HTML.
467#
468# The HTML is going to be inserted into a GML file, so
469# we have to be careful not to use symbols like ">",
470# which ocurs frequently in email messages (and use
471# &gt instead.
472#
473# This function also turns links and email addresses into hyperlinks,
474# and replaces carriage returns with <BR> tags (and multiple carriage
475# returns with <P> tags).
476
[1206]477
[638]478sub text_into_html {
479 my ($text) = @_;
480
[1244]481 # Convert problem characters into HTML symbols
[3132]482 $text =~ s/&/&amp;/g;
483 $text =~ s/</&lt;/g;
484 $text =~ s/>/&gt;/g;
485 $text =~ s/\"/&quot;/g;
[638]486
[2630]487 # convert email addresses and URIs into links
488# don't markup email addresses for now
489# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
[638]490
[2918]491 # try to munge email addresses a little bit...
492 $text =~ s/@/&#64;/;
[2730]493 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
494 # assume URI doesn't finish with a '.'
[10827]495 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.|\%[a-f0-9]{2})?[\w\?\=\-_/~]+)*(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@gi;
[2630]496
497
[638]498 # Clean up whitespace and convert \n charaters to <BR> or <P>
[3132]499 $text =~ s/ +/ /g;
500 $text =~ s/\s*$//g;
501 $text =~ s/^\s*//g;
502 $text =~ s/\n/\n<br>/g;
503 $text =~ s/<br>\s*<br>/<p>/gi;
[638]504
505 return $text;
506}
507
508
[2630]509
510
511#Process a MIME message.
512# the textref we are given DOES NOT include the header.
513sub text_from_mime_message {
[2847]514 my $self = shift(@_);
[16341]515 my ($mimetype,$mimeinfo,$default_header_encoding,$text)=(@_);
[6062]516 my $outhandle=$self->{'outhandle'};
[2630]517 # Check for multiparts - $mimeinfo will be a boundary
518 if ($mimetype =~ /multipart/) {
[6062]519 my $boundary="";
[2732]520 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
[2630]521 $boundary=$1;
[2732]522 if ($boundary =~ m@^\"@) {
523 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
524 }
525 } else {
[15872]526 print $outhandle "EmailPlugin: (warning) couldn't parse MIME boundary\n";
[2630]527 }
528 # parts start with "--$boundary"
529 # message ends with "--$boundary--"
530 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
[2680]531 # that perl might want to interpolate. Also allows spaces...
[2630]532 $boundary=~s/\\/\\\\/g;
533 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
[2681]534 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
[2630]535 # remove first "part" and last "part" (final --)
536 shift @message_parts;
537 my $last=pop @message_parts;
[2680]538 # if our boundaries are a bit dodgy and we only found 1 part...
539 if (!defined($last)) {$last="";}
[2630]540 # make sure it is only -- and whitespace
541 if ($last !~ /^\-\-\s*$/ms) {
[15872]542 print $outhandle "EmailPlugin: (warning) last part of MIME message isn't empty\n";
[2630]543 }
544 foreach my $message_part (@message_parts) {
545 # remove the leading newline left from split.
546 $message_part=~s/^\r?\n//;
547 }
548 if ($mimetype eq "multipart/alternative") {
549 # check for an HTML version first, then TEXT, otherwise use first.
550 my $part_text="";
551 foreach my $message_part (@message_parts) {
[10839]552 if ($message_part =~ m@^content\-type:\s*text/html@i)
[2630]553 {
554 # Use the HTML version
[10834]555 $part_text = $self->text_from_part($message_part);
[2630]556 $mimetype="text/html";
557 last;
558 }
559 }
560 if ($part_text eq "") { # try getting a text part instead
561 foreach my $message_part (@message_parts) {
[10839]562 if ($message_part =~ m@^content\-type:\s*text/plain@i)
[2630]563 {
564 # Use the plain version
[10839]565 $part_text = $self->text_from_part($message_part);
[2732]566 if ($part_text =~/[^\s]/) {
[3721]567 $part_text = text_into_html($part_text);
[2732]568 }
[2630]569 $mimetype="text/plain";
570 last;
571 }
572 }
573 }
[3721]574 if ($part_text eq "") { #use first part (no html/text part found)
575 $part_text = $self->text_from_part(shift @message_parts);
576 $part_text = text_into_html($part_text);
[2630]577 }
578 if ($part_text eq "") { # we couldn't get anything!!!
579 # or it was an empty message...
580 # do nothing...
[16013]581 gsprintf($outhandle, "{ReadTextFile.empty_file} - empty body?\n");
[2630]582 } else {
[3721]583 $text = $part_text;
[2630]584 }
[3352]585 } elsif ($mimetype =~ m@multipart/(mixed|digest|related|signed)@) {
[3721]586 $text = "";
[3352]587 # signed is for PGP/GPG messages... the last part is a hash
588 if ($mimetype =~ m@multipart/signed@) {
589 pop @message_parts;
590 }
[6062]591 my $is_first_part=1;
[2630]592 foreach my $message_part (@message_parts) {
[6062]593 if ($is_first_part && $text ne "") {$is_first_part=0;}
594
[2630]595 if ($mimetype eq "multipart/digest") {
[6062]596 # default type - RTFRFC!! Set if not already set
597 $message_part =~ m@^(.*)\n\r?\n@s;
598 my $part_header=$1;
599 if ($part_header !~ m@^content-type@mi) {
600 $message_part="Content-type: message/rfc822\n"
601 . $message_part; # prepend default type
[4224]602 }
[2630]603 }
604
[16341]605 $text .= $self->process_multipart_part($default_header_encoding,
606 $message_part,
[6062]607 $is_first_part);
[2630]608 } # foreach message part.
609 } else {
610 # we can't handle this multipart type (not mixed or alternative)
611 # the RFC also mentions "parallel".
612 }
[3627]613 } # end of ($mimetype =~ multipart)
[2918]614 elsif ($mimetype =~ m@message/rfc822@) {
615 my $msg_header = $text;
616 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
617 $text = $1;
618
[3630]619 if ($msg_header =~ /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.+?)\s*$/mi)
[2918]620 {
621 $mimetype=$1;
[9971]622 $mimeinfo=$2;
[2918]623 $mimetype =~ tr/[A-Z]/[a-z]/;
[9971]624
[2918]625 my $msg_text;
626 if ($mimetype =~ m@multipart/@) {
[16341]627 $msg_text = $self->text_from_mime_message($mimetype, $mimeinfo,
628 $default_header_encoding,
[6062]629 $text);
[9823]630 } else {
[9971]631 $msg_text=$self->text_from_part($text,$msg_header);
[9823]632 }
[2918]633
[3630]634 my $brief_header=text_into_html(get_brief_headers($msg_header));
[2918]635 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
636 $text.= "<table><tr><td width=\"5%\"> </td>\n";
637 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
638 . "</td></tr></table>";
639 }
640 } else {
[2886]641 # we don't do any processing of the content.
642 }
643
[2630]644 return $text;
645}
646
647
[6062]648
[7703]649# used for turning a message id into a more friendly string for greenstone
650sub escape_msg_id {
651#msgid
652 my $id=shift;
653 chomp $id; $id =~ s!\s!!g; # remove spaces
654 $id =~ s![<>\[\]]!!g; # remove [ ] < and >
655 $id =~ s![_&]!-!g; # replace symbols that might cause problems
[10834]656 $id =~ s!\.!-!g; # . means section to greenstone doc ids!
[7703]657 $id =~ s!@!-!g; # replace @ symbol, to avoid spambots
658 return $id;
659}
[6062]660
[7703]661
662
[6062]663sub process_multipart_part {
664 my $self = shift;
[16341]665 my $default_header_encoding = shift;
[6062]666 my $message_part = shift;
667 my $is_first_part = shift;
668
669 my $return_text="";
670 my $part_header=$message_part;
671 my $part_body;
672 if ($message_part=~ /^\s*\n/) {
673 # no header... use defaults
674 $part_body=$message_part;
675 $part_header="Content-type: text/plain; charset=us-ascii";
676 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
677 $part_body=$1;
678 } else {
679 # something's gone wrong...
680 $part_header="";
681 $part_body=$message_part;
682 }
683
684 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
685 my $part_content_type="";
686 my $part_content_info="";
687
688 if ($part_header =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;.*)?$@mi) {
689 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
690 $part_content_info=$2;
691 if (!defined($part_content_info)) {
692 $part_content_info="";
693 } else {
694 $part_content_info =~ s/^\;\s*//;
695 $part_content_info =~ s/\s*$//;
696 }
697 }
698 my $filename="";
[6079]699 if ($part_header =~ m@name=\"?([^\"\n]+)\"?@mis) {
[6062]700 $filename=$1;
[6079]701 $filename =~ s@\r?\s*$@@; # remove trailing space, if any
[16341]702 # decode the filename
703 $self->decode_header_value($default_header_encoding, \$filename);
704
[6062]705 }
706
707 # disposition - either inline or attachment.
708 # NOT CURRENTLY USED - we display all text types instead...
709 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
710
711 # add <<attachment>> to each part except the first...
712 if (!$is_first_part) {
713 $return_text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
714 # add part info header
[10827]715 my $header_text = "<br>Type: $part_content_type<br>\n";
[6062]716 if ($filename ne "") {
[10827]717 $header_text .= "Filename: $filename\n";
[6062]718 }
719 $header_text =~ s@_@\\_@g;
[10827]720 $return_text .= $header_text . "</strong></p>\n<p>\n";
[6062]721 }
722
723 if ($part_content_type =~ m@text/@)
724 {
[10827]725 # $message_part includes the mime part headers
726 my $part_text = $self->text_from_part($message_part);
[6062]727 if ($part_content_type !~ m@text/(ht|x)ml@) {
728 $part_text = text_into_html($part_text);
729 }
730 if ($part_text eq "") {
731 $part_text = ' ';
732 }
733 $return_text .= $part_text;
734 } elsif ($part_content_type =~ m@message/rfc822@) {
735 # This is a forwarded message
736 my $message_part_headers=$part_body;
737 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
738 my $message_part_body=$1;
739 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
740
741 my $rfc822_formatted_body=""; # put result in here
742 if ($message_part_headers =~
743 /^content\-type:\s*([\w\.\-\/]+)\s*(\;.*)?$/ims)
744 {
745 # The message header uses MIME flags
746 my $message_content_type=$1;
747 my $message_content_info=$2;
748 if (!defined($message_content_info)) {
749 $message_content_info="";
750 } else {
751 $message_content_info =~ s/^\;\s*//;
752 $message_content_info =~ s/\s*$//;
753 }
754 $message_content_type =~ tr/A-Z/a-z/;
755 if ($message_content_type =~ /multipart/) {
756 $rfc822_formatted_body=
757 $self->text_from_mime_message($message_content_type,
758 $message_content_info,
[16341]759 $default_header_encoding,
[6062]760 $message_part_body);
761 } else {
[9971]762 $message_part_body=$self->text_from_part($part_body,
763 $message_part_headers);
[6062]764 $rfc822_formatted_body=text_into_html($message_part_body);
765 }
766 } else {
767 # message doesn't use MIME flags
768 $rfc822_formatted_body=text_into_html($message_part_body);
769 $rfc822_formatted_body =~ s@_@\\_@g;
770 }
771 # Add the returned text to the output
772 # don't put all the headers...
773# $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
774 my $brief_headers=get_brief_headers($message_part_headers);
775 $return_text.=text_into_html($brief_headers);
776 $return_text.="</p><p>\n";
777 $return_text.=$rfc822_formatted_body;
778 $return_text.="</p>\n";
779 # end of message/rfc822
780 } elsif ($part_content_type =~ /multipart/) {
781 # recurse again
782
783 my $tmptext= $self->text_from_mime_message($part_content_type,
784 $part_content_info,
[16341]785 $default_header_encoding,
[6062]786 $part_body);
787 $return_text.=$tmptext;
788 } else {
789 # this part isn't text/* or another message...
790 if ($is_first_part) {
791 # this is the first part of a multipart, or only part!
792 $return_text="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
793 # add part info header
794 my $header_text="<br>Type: $part_content_type<br>\n";
795 $header_text.="Filename: $filename</strong></p>\n<p>\n";
796 $header_text =~ s@_@\\_@g;
797 $return_text.=$header_text;
798 }
799
800 # save attachment by default
[10218]801 if (!$self->{'no_attachments'}
[6062]802 && $filename ne "") { # this part has a file...
803 my $encoding="8bit";
804 if ($part_header =~
805 /^content-transfer-encoding:\s*(\w+)/mi ) {
806 $encoding=$1; $encoding =~ tr/A-Z/a-z/;
807 }
[28563]808 my $tmpdir=&FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
[6062]809 my $save_filename=$filename;
810
811 # make sure we don't clobber files with same name;
812 # need to keep state between .mbx files
813 my $assoc_files=$self->{'assoc_filenames'};
814 if ($assoc_files->{$filename}) { # it's been set...
815 $assoc_files->{$filename}++;
816 $filename =~ m/(.+)\.(\w+)$/;
817 my ($filestem, $ext)=($1,$2);
818 $save_filename="${filestem}_"
819 . $assoc_files->{$filename} . ".$ext";
820 } else { # first file with this name
821 $assoc_files->{$filename}=1;
822 }
[28563]823 my $tmp_filename = &FileUtils::filenameConcatenate($tmpdir, $save_filename);
[16341]824 open (SAVE, ">$tmp_filename") ||
825 warn "EMAILPlug: Can't save attachment as $tmp_filename: $!";
826 binmode(SAVE); # needed on Windows
[6062]827 my $part_text = $message_part;
828 $part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
829 if ($encoding eq "base64") {
830 print SAVE base64_decode($part_text);
831 } elsif ($encoding eq "quoted-printable") {
832 print SAVE qp_decode($part_text);
833 } else { # 7bit, 8bit, binary, etc...
834 print SAVE $part_text;
835 }
836 close SAVE;
837 my $doc_obj=$self->{'doc_obj'};
[16341]838 $doc_obj->associate_file("$tmp_filename",
[6062]839 "$save_filename",
840 $part_content_type # mimetype
841 );
[16341]842 # add this file to the list of tmp files for deleting later
843 push(@{$self->{'tmp_file_paths'}}, $tmp_filename);
844
[6062]845 my $outhandle=$self->{'outhandle'};
[15872]846 print $outhandle "EmailPlugin: saving attachment \"$filename\"\n"; #
[6062]847
848 # be nice if "download" was a translatable macro :(
849 $return_text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
850 } # end of save attachment
851 } # end of !text/message part
852
853
854 return $return_text;
855}
856
857
[3630]858# Return only the "important" headers from a set of message headers
859sub get_brief_headers {
860 my $msg_header = shift;
861 my $brief_header = "";
[2630]862
[3630]863 # Order matters!
864 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
865 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
866 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
867 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
868 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
[2630]869
[3630]870 return $brief_header;
871}
[2630]872
873
874# Process a MIME part. Return "" if we can't decode it.
[6062]875# should only be called for parts with type "text/*" ?
[9971]876# Either pass the entire mime part (including the part's header),
877# or pass the mime part's text and optionally the part's header.
[2630]878sub text_from_part {
[3132]879 my $self = shift;
[3136]880 my $text = shift || '';
[9971]881 my $part_header = shift;
[3136]882
[10834]883
[9971]884 my $type="text/plain"; # default, overridden from part header
[10827]885 my $charset=undef; # convert2unicode() will guess if necessary
[9971]886
887 if (! $part_header) { # no header argument was given. check the body
888 $part_header = $text;
889 # check for empty part header (leading blank line)
890 if ($text =~ /^\s*\r?\n/) {
891 $part_header="Content-type: text/plain; charset=us-ascii";
892 } else {
893 $part_header =~ s/\r?\n\r?\n(.*)$//s;
894 $text=$1; if (!defined($text)) {$text="";}
895 }
896 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
[2681]897 }
[9971]898
899 if ($part_header =~
900 /content\-type:\s*([\w\.\-\/]+).*?charset=\"?([^\;\"\s]+)\"?/is) {
901 $type=$1;
902 $charset=$2;
903 }
[2630]904 my $encoding="";
[2638]905 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
[2630]906 $encoding=$1; $encoding=~tr/A-Z/a-z/;
907 }
908 # Content-Transfer-Encoding is per-part
909 if ($encoding ne "") {
910 if ($encoding =~ /quoted\-printable/) {
911 $text=qp_decode($text);
912 } elsif ($encoding =~ /base64/) {
913 $text=base64_decode($text);
914 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
915 # rfc2045 also allows binary, which we ignore (for now).
[3630]916 my $outhandle=$self->{'outhandle'};
[15872]917 print $outhandle "EmailPlugin: unknown transfer encoding: $encoding\n";
[2630]918 return "";
919 }
920 }
[10834]921
[2630]922 if ($type eq "text/html") {
923 # only get stuff between <body> tags, or <html> tags.
[2730]924 $text =~ s@^.*<html[^>]*>@@is;
925 $text =~ s@</html>.*$@@is;
926 $text =~ s/^.*?<body[^>]*>//si;
927 $text =~ s/<\/body>.*$//si;
[2630]928 }
929 elsif ($type eq "text/xml") {
930 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
931 $text="<pre>\n$text\n</pre>\n";
932 }
[2730]933 # convert to unicode
[2847]934 $self->convert2unicode($charset, \$text);
[6062]935 $text =~ s@_@\\_@g; # protect against GS macro language
[2630]936 return $text;
937}
938
939
[6062]940
941
[2630]942# decode quoted-printable text
943sub qp_decode {
944 my $text=shift;
945
946 # if a line ends with "=\s*", it is a soft line break, otherwise
947 # keep in any newline characters.
948
[3627]949 $text =~ s/=\s*\r?\n//mg;
950 $text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
[2630]951 return $text;
952}
953
954# decode base64 text. This is fairly slow (since it's interpreted perl rather
955# than compiled XS stuff like in the ::MIME modules, but this is more portable
956# for us at least).
957# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
958# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
[2638]959# from each encoded byte.
[2630]960
961
962sub base64_decode {
963 my $enc_text = shift;
964# A=>0, B=>1, ..., '+'=>62, '/'=>63
965# also '=' is used for padding at the end, but we remove it anyway.
966 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
967# map each MIME char into it's value, for more efficient lookup.
968 my %index;
969 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
970# remove all non-base64 chars. eval to get variable in transliteration...
971# also remove '=' - we'll assume (!!) that there are no errors in the encoding
972 eval "\$enc_text =~ tr|$mimechars||cd";
973 my $decoded="";
974 while (length ($enc_text)>3)
975 {
976 my $fourchars=substr($enc_text,0,4,"");
977 my @chars=(split '',$fourchars);
978 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
979 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
980 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
981 }
982# if there are any input chars left, there are either
983# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
984 my @chars=(split '',$enc_text);
985 if (length($enc_text)) {
986 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
987 }
988 if (length($enc_text)==3) {
989 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
990 }
991 return $decoded;
992}
993
[10834]994# returns 0 if valid utf-8, 1 if invalid
995sub is_utf8 {
996 my $self = shift;
997 my $textref = shift;
998
999 $$textref =~ m/^/g; # to set \G
1000 my $badbytesfound=0;
1001 while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
1002 my $highbytes=$1;
1003 my $highbyteslength=length($highbytes);
1004 # replace any non utf8 complaint bytes
1005 $highbytes =~ /^/g; # set pos()
1006 while ($highbytes =~
1007 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
1008 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
1009 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
1010 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
1011 [\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1012 )*([\x80-\xff])? !xg
1013 ) {
1014 my $badbyte=$1;
1015 if (!defined $badbyte) {next} # hit end of string
1016 return 1;
1017 }
1018 }
1019 return 0;
1020}
1021
[16341]1022# words with non ascii characters in header values must be encoded in the
1023# following manner =?<charset>?[BQ]?<data>?= (rfc2047)
[10834]1024
[16341]1025sub decode_header_value {
1026 my $self = shift(@_);
1027 my ($default_header_encoding, $textref) = @_;
1028
1029 if (!$$textref) {
1030 # nothing to do!
1031 return;
1032 }
1033 my $value = $$textref;
1034 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
1035 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
1036 my $original_value=$value;
1037 my $encoded=$value;
1038 $value="";
1039 # we should ignore spaces between consecutive encoded-texts
1040 $encoded =~ s@\?=\s+=\?@\?==\?@g;
1041 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=//i) {
1042 my ($charset, $encoding, $data)=($2,$3,$4);
1043 my ($decoded_data);
1044 my $leading_chars = "$1";
1045 $self->convert2unicode($default_header_encoding, \$leading_chars);
1046 $value.=$leading_chars;
1047
1048 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
1049 chomp $data;
1050 $encoding =~ tr/BQ/bq/;
1051 if ($encoding eq "q") { # quoted printable
1052 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
1053 $decoded_data=qp_decode($data);
1054 # qp_decode adds \n, which is default for body text
1055 chomp($decoded_data);
1056 } else { # base 64
1057 $decoded_data=base64_decode($data);
1058 }
1059 $self->convert2unicode($charset, \$decoded_data);
1060 $value .= $decoded_data;
1061 } # end of while loop
1062
1063 # get any trailing characters
1064 $self->convert2unicode($default_header_encoding, \$encoded);
1065 $value.=$encoded;
1066
1067 if ($value =~ /^\s*$/) { # we couldn't extract anything...
1068 $self->convert2unicode($default_header_encoding,
1069 \$original_value);
1070 $value=$original_value;
1071 }
1072 $$textref = $value;
1073 } # end of if =?...?=
1074
1075 # In the absense of other charset information, assume the
1076 # header is the default (usually "iso_8859_1") and convert to unicode.
1077 else {
1078 $self->convert2unicode($default_header_encoding, $textref);
1079 }
1080
1081}
[10834]1082
[16341]1083
1084
[2847]1085sub convert2unicode {
1086 my $self = shift(@_);
1087 my ($charset, $textref) = @_;
[2630]1088
[7703]1089 if (!$$textref) {
1090 # nothing to do!
1091 return;
1092 }
1093
[10827]1094 if (! defined $charset) {
1095 # check if we have valid utf-8
[10834]1096 if ($self->is_utf8($textref)) { $charset = "utf8" }
[10827]1097
1098 # default to latin
1099 $charset = "iso_8859_1" if ! defined($charset);
1100 }
1101
[2847]1102 # first get our character encoding name in the right form.
[7703]1103 $charset =~ tr/A-Z/a-z/; # lowercase
1104 $charset =~ s/\-/_/g;
[8902]1105 if ($charset =~ /gb_?2312/) { $charset="gb" }
[2847]1106 # assumes EUC-KR, not ISO-2022 !?
[7703]1107 $charset =~ s/^ks_c_5601_1987/korean/;
1108 if ($charset eq 'utf_8') {$charset='utf8'}
[2847]1109
[7703]1110 my $outhandle = $self->{'outhandle'};
1111
1112 if ($charset eq "utf8") {
1113 # no conversion needed, but lets check that it's valid utf8
1114 # see utf-8 manpage for valid ranges
1115 $$textref =~ m/^/g; # to set \G
1116 my $badbytesfound=0;
1117 while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
1118 my $highbytes=$1;
1119 my $highbyteslength=length($highbytes);
1120 # replace any non utf8 complaint bytes
1121 $highbytes =~ /^/g; # set pos()
1122 while ($highbytes =~
[8904]1123 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
[7703]1124 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
[8904]1125 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
1126 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
[7703]1127 [\xfc-\xfd][\x80-\xbf]{5} # 6 byte
1128 )*([\x80-\xff])? !xg
1129 ) {
1130 my $badbyte=$1;
1131 if (!defined $badbyte) {next} # hit end of string
1132 my $pos=pos($highbytes);
1133 substr($highbytes, $pos-1, 1, "\xc2\x80");
1134 # update the position to continue searching (for \G)
1135 pos($highbytes) = $pos+1; # set to just after the \x80
1136 $badbytesfound=1;
1137 }
1138 if ($badbytesfound==1) {
1139 # claims to be utf8, but it isn't!
[15872]1140 print $outhandle "EmailPlugin: Headers claim utf-8 but bad bytes "
[7703]1141 . "detected and removed.\n";
1142
1143 my $replength=length($highbytes);
1144 my $textpos=pos($$textref);
1145 # replace bad bytes with good bytes
1146 substr( $$textref, $textpos-$replength,
1147 $replength, $highbytes);
1148 # update the position to continue searching (for \G)
1149 pos($$textref)=$textpos+($replength-$highbyteslength);
1150 }
1151 }
[3206]1152 return;
1153 }
1154
[2847]1155 # It appears that we can't always trust ascii text so we'll treat it
1156 # as iso-8859-1 (letting characters above 0x80 through without
1157 # converting them to utf-8 will result in invalid XML documents
1158 # which can't be parsed at build time).
1159 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
1160
[3073]1161 if ($charset eq "iso_8859_1") {
1162 # test if the mailer lied, and it has win1252 chars in it...
1163 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
1164 if ($$textref =~ m/[\x80-\x9f]/) {
[15872]1165 print $outhandle "EmailPlugin: Headers claim ISO charset but MS ";
[3073]1166 print $outhandle "codepage 1252 detected.\n";
1167 $charset = "windows_1252";
1168 }
1169 }
[3351]1170 my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
1171
1172 if ($utf8_text ne "") {
1173 $$textref=$utf8_text;
1174 } else {
1175 # we didn't get any text... unsupported encoding perhaps? Or it is
1176 # empty anyway. We'll try to continue, assuming 8859-1. We could strip
1177 # characters out here if this causes problems...
[3726]1178 my $outhandle=$self->{'outhandle'};
[15872]1179 print $outhandle "EmailPlugin: falling back to iso-8859-1\n";
[3351]1180 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
1181
1182 }
[2847]1183}
1184
[17026]1185sub get_base_OID {
1186 my $self = shift(@_);
1187 my ($doc_obj) = @_;
[2847]1188
[17026]1189 if ($self->{'OIDtype'} eq "message_id") {
1190 # temporarily set OIDtype to hash to get a base id
1191 $self->{'OIDtype'} = "hash_on_ga_xml";
[19282]1192 my $id = $self->SUPER::get_base_OID(@_);
[17026]1193 $self->{'OIDtype'} = "message_id";
1194 return $id;
1195 }
1196 return $self->SUPER::get_base_OID(@_);
1197}
1198
1199
1200sub add_OID {
[7703]1201 my $self = shift (@_);
1202 my ($doc_obj, $id, $segment_number) = @_;
[17026]1203 if ($self->{'OIDtype'} eq "message_id" && exists $doc_obj->{'msgid'} ) {
[7703]1204 $doc_obj->set_OID($doc_obj->{'msgid'});
[17026]1205 }
1206 else {
[7703]1207 $doc_obj->set_OID("$id\_$segment_number");
1208 }
1209}
1210
1211
[1206]1212# Perl packages have to return true if they are run.
12131;
Note: See TracBrowser for help on using the repository browser.