Changeset 2730 for trunk/gsdl/perllib/plugins/EMAILPlug.pm
- Timestamp:
- 2001-09-03T15:29:45+12:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/EMAILPlug.pm
r2717 r2730 39 39 # after the first blank line in the document. 40 40 # 41 # Metadata :41 # Metadata (not Dublin Core!): 42 42 # $Headers All the header content 43 43 # $Subject Subject: header 44 44 # $To To: header 45 # $From From: header - this will be stored as Creator45 # $From From: header 46 46 # $FromName Name of sender (where available) 47 47 # $FromAddr E-mail address of sender … … 57 57 # * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1 58 58 # * RFC 2046 - MIME (part 2) Media Types (and multipart messages) 59 # * RFC 2047 - MIME (part 3) Message Header Extensions 59 60 # * RFC 1806 - Content Dispositions (ie inline/attachment) 60 61 package EMAILPlug; 61 62 62 63 use SplitPlug; 64 65 use unicode; 63 66 64 67 use sorttools; … … 79 82 my ($class) = @_; 80 83 my $self = new BasPlug ("EMAILPlug", @_); 81 # make sure we don't run textcat (defaults to "auto"); 82 $self->{'input_encoding'}="iso_8859_1"; # this might not be good enough... 84 # this might not actually be true at read-time, but after processing 85 # it should all be utf8. 86 $self->{'input_encoding'}="utf8"; 83 87 return bless $self, $class; 84 88 } … … 128 132 $Headers =~ s/\r?\n\r?\n(.*)$//s; 129 133 $$textref = $1; 130 # escape [] so it isn't re-interpreted as metadata131 $Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g;132 133 134 134 135 # Unfold headers - see rfc822 … … 163 164 $value =~ s/\s+$//; 164 165 166 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047) 167 if ($value =~ /=\?/) { 168 my $original_value=$value; 169 my $encoded=$value; 170 $value=""; 171 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) { 172 my ($charset, $encoding, $data)=($2,$3,$4); 173 my $decoded_data; 174 $value.="$1"; # any leading chars 175 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends 176 chomp $data; 177 $encoding =~ tr/BQ/bq/; 178 if ($encoding eq "q") { # quoted printable 179 $decoded_data=qp_decode($data); 180 } else { # base 64 181 $decoded_data=base64_decode($data); 182 } 183 if (defined($charset)) { 184 $charset=~tr/A-Z/a-z/; 185 $charset=~s/\-/_/g; 186 $charset=~s/gb2312/gb/; 187 # assumes EUC-KR, not ISO-2022 !? 188 $charset=~s/ks_c_5601_1987/korean/; 189 } else {$charset="ascii";} 190 if ($charset eq "ascii" || $charset eq "us-ascii") { 191 # technically possible to have this explicitly... 192 $value.=$decoded_data; 193 } else { 194 my $utf8_text=&unicode::unicode2utf8 195 ( 196 &unicode::convert2unicode($charset,\$decoded_data) 197 ); 198 $value.=$utf8_text; 199 } 200 } # end of while loop 201 $value.=$encoded; # get any trailing characters 202 if ($value =~ /^\s*$/) { # we couldn't extract anything... 203 $value=original_value; 204 } 205 } # end of if =?...?= 206 165 207 # Store the metadata 166 208 $raw{$name} = $value; … … 201 243 202 244 # Escape < and > in the whole From field; 203 $frommeta =~ s/</</g; $frommeta =~ s/>/>/g;204 245 $raw{"From"}=$frommeta; 205 246 … … 220 261 } 221 262 222 223 263 # Add extracted metadata to document object 224 264 foreach my $name (keys %raw) { 225 265 $value = $raw{$name}; 226 266 if ($value) { 267 # assume subject, etc headers have no special HTML meaning. 268 $value =~ s@&@&\;@g; 269 $value =~ s/</</g; $value =~ s/>/>/g; 227 270 $value = &text_into_html($value); 271 # escape [] so it isn't re-interpreted as metadata 272 $value =~ s/\[/[/g; $value =~ s/\]/]/g; 228 273 } else { 229 274 $value = "No $name field"; … … 260 305 $Headers = "No headers" unless ($Headers =~ /\w/); 261 306 $Headers =~ s/@/@\;/g; 307 # escape [] so it isn't re-interpreted as metadata 308 $Headers =~ s/\[/[/g; $Headers =~ s/\]/]/g; 309 262 310 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers); 263 311 … … 298 346 # $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g; 299 347 300 # assume hostnames are \.\w\ d\- only, then might have a trailing '/.*'301 # URI can't finish with a '.'302 $text =~ s /((http|ftp|https):\/\/[\w\d\-]+(\.[\w\d\-]+)*\/?((&|\.)?[\w\d\?\=\-_\/~]+)*)/<a href=\"$1\">$1<\/a>/g;348 # assume hostnames are \.\w\- only, then might have a trailing '/.*' 349 # assume URI doesn't finish with a '.' 350 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&|\.)?[\w\?\=\-_/~]+)*)@<a href=\"$1\">$1<\/a>@g; 303 351 304 352 … … 419 467 # add <<attachment>> to each part except the first... 420 468 if ($text ne "") { 421 $text.=" <p><hr><strong><<attachment>></>";469 $text.="\n<p><hr><strong><<attachment>></>"; 422 470 # add part info header 423 471 $text.="<br>Type: $part_content_type<br>\n"; … … 488 536 # part then make sure it is mentioned.. 489 537 490 $text.=" <p><hr><strong><<attachment>></>";538 $text.="\n<p><hr><strong><<attachment>></>"; 491 539 # add part info header 492 540 $text.="<br>Type: $part_content_type<br>\n"; … … 522 570 } 523 571 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold 524 $part_header =~ /content\-type:\s*([\w\/]+)/is; 525 my $type=$1; if (!defined($type)) {$type="";} 572 $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is; 573 my $type=$1; 574 my $charset=$2; 575 if (!defined($type)) {$type="";} 576 if (!defined($charset)) {$charset="ascii";} 526 577 my $encoding=""; 527 578 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) { … … 541 592 } 542 593 } 543 544 594 if ($type eq "text/html") { 545 595 # only get stuff between <body> tags, or <html> tags. 546 $text =~ s/^.*?<(html|HTML)[^>]*>//s; 547 $text =~ s/<\/(html|HTML)>.*$//s; 548 549 $text =~ s/^.*?<(body|BODY)[^>]*>//s; 550 $text =~ s/<\/(body|BODY)>.*$//s; 596 $text =~ s@^.*<html[^>]*>@@is; 597 $text =~ s@</html>.*$@@is; 598 $text =~ s/^.*?<body[^>]*>//si; 599 $text =~ s/<\/body>.*$//si; 551 600 } 552 601 elsif ($type eq "text/xml") { 553 602 $text=~s/</</g;$text=~s/>/>/g; 554 603 $text="<pre>\n$text\n</pre>\n"; 604 } 605 # convert to unicode 606 # first get our character encoding name in the right form. 607 $charset=~tr/A-Z/a-z/; 608 $charset=~s/\-/_/g; 609 if ($charset ne "us_ascii" && $charset ne "ascii") { 610 $charset=~s/gb2312/gb/; 611 # assumes EUC-KR, not ISO-2022 !? 612 $charset=~s/ks_c_5601_1987/korean/; 613 my @unicode_array=&unicode::convert2unicode($charset,\$text); 614 my $utf8_text=&unicode::unicode2utf8(@unicode_array); 615 $text=$utf8_text; 555 616 } 556 617 return $text;
Note:
See TracChangeset
for help on using the changeset viewer.