Changeset 7703
- Timestamp:
- 2004-07-05T17:52:01+12:00 (20 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/EMAILPlug.pm
r6916 r7703 358 358 } 359 359 360 361 # extract a message ID from the headers, if there is one, and we'll use 362 # that as the greenstone doc ID. Having a predictable ID means we can 363 # link to other messages, eg from In-Reply-To or References headers... 364 if ($Headers =~ m@^Message-ID:(.+)$@mi) { 365 my $id=escape_msg_id($1); 366 $doc_obj->{'msgid'}=$id; 367 } 368 # link to another message, if this is a reply 369 if ($Headers =~ m@^In-Reply-To:(.+)$@mi) { 370 my $id=escape_msg_id($1); 371 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id); 372 } elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) { 373 # References can have multiple, get the last one 374 my $id=escape_msg_id($1); 375 # not necessarily in-reply-to, but same thread... 376 $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id); 377 } 378 379 380 360 381 my $mimetype="text/plain"; 361 382 my $mimeinfo=""; … … 425 446 # Add Title metadata 426 447 my $Title = text_into_html($raw{'Subject'}); 427 $Title .= "<br>From: " . text_into_html($ raw{'From'});448 $Title .= "<br>From: " . text_into_html($fromnamemeta); 428 449 $Title .= "<br>Date: " . text_into_html($raw{'DateText'}); 429 450 $Title =~ s/\[/[/g; $Title =~ s/\]/]/g; … … 620 641 } 621 642 643 644 645 # used for turning a message id into a more friendly string for greenstone 646 sub escape_msg_id { 647 #msgid 648 my $id=shift; 649 chomp $id; $id =~ s!\s!!g; # remove spaces 650 $id =~ s![<>\[\]]!!g; # remove [ ] < and > 651 $id =~ s![_&]!-!g; # replace symbols that might cause problems 652 $id =~ s!@!-!g; # replace @ symbol, to avoid spambots 653 return $id; 654 } 622 655 623 656 … … 938 971 my ($charset, $textref) = @_; 939 972 973 if (!$$textref) { 974 # nothing to do! 975 return; 976 } 977 940 978 # first get our character encoding name in the right form. 941 979 $charset = "iso_8859_1" unless defined $charset; 942 $charset =~tr/A-Z/a-z/;943 $charset =~s/\-/_/g;944 $charset =~s/gb2312/gb/;980 $charset =~ tr/A-Z/a-z/; # lowercase 981 $charset =~ s/\-/_/g; 982 $charset =~ s/gb2312/gb/; 945 983 # assumes EUC-KR, not ISO-2022 !? 946 $charset=~s/ks_c_5601_1987/korean/; 947 948 if ($charset eq "utf_8" || !$$textref) { 949 # nothing to do! 984 $charset =~ s/^ks_c_5601_1987/korean/; 985 if ($charset eq 'utf_8') {$charset='utf8'} 986 987 my $outhandle = $self->{'outhandle'}; 988 989 if ($charset eq "utf8") { 990 # no conversion needed, but lets check that it's valid utf8 991 # see utf-8 manpage for valid ranges 992 $$textref =~ m/^/g; # to set \G 993 my $badbytesfound=0; 994 while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) { 995 my $highbytes=$1; 996 my $highbyteslength=length($highbytes); 997 # replace any non utf8 complaint bytes 998 $highbytes =~ /^/g; # set pos() 999 while ($highbytes =~ 1000 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8 1001 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte 1002 [\xf0-\xf7][\x80-\xbf]{3} # 4 byte 1003 [\xf8-\xfb][\x80-\xbf]{4} # 5 byte 1004 [\xfc-\xfd][\x80-\xbf]{5} # 6 byte 1005 )*([\x80-\xff])? !xg 1006 ) { 1007 my $badbyte=$1; 1008 if (!defined $badbyte) {next} # hit end of string 1009 my $pos=pos($highbytes); 1010 substr($highbytes, $pos-1, 1, "\xc2\x80"); 1011 # update the position to continue searching (for \G) 1012 pos($highbytes) = $pos+1; # set to just after the \x80 1013 $badbytesfound=1; 1014 } 1015 if ($badbytesfound==1) { 1016 # claims to be utf8, but it isn't! 1017 print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes " 1018 . "detected and removed.\n"; 1019 1020 my $replength=length($highbytes); 1021 my $textpos=pos($$textref); 1022 # replace bad bytes with good bytes 1023 substr( $$textref, $textpos-$replength, 1024 $replength, $highbytes); 1025 # update the position to continue searching (for \G) 1026 pos($$textref)=$textpos+($replength-$highbyteslength); 1027 } 1028 } 950 1029 return; 951 1030 } … … 961 1040 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't 962 1041 if ($$textref =~ m/[\x80-\x9f]/) { 963 my $outhandle = $self->{'outhandle'};964 1042 print $outhandle "EMAILPlug: Headers claim ISO charset but MS "; 965 1043 print $outhandle "codepage 1252 detected.\n"; … … 983 1061 984 1062 1063 sub set_OID { 1064 my $self = shift (@_); 1065 my ($doc_obj, $id, $segment_number) = @_; 1066 1067 if ( exists $doc_obj->{'msgid'} ) { 1068 $doc_obj->set_OID($doc_obj->{'msgid'}); 1069 } else { 1070 $doc_obj->set_OID("$id\_$segment_number"); 1071 } 1072 } 1073 1074 985 1075 # Perl packages have to return true if they are run. 986 1076 1;
Note:
See TracChangeset
for help on using the changeset viewer.