- Timestamp:
- 2000-07-13T10:21:53+12:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/New_Config_Format-branch/gsdl/perllib/plugins/EMAILPlug.pm
r638 r1279 26 26 27 27 28 # 29 # EMAILPlug reads an email file (*.email) 30 # 31 # Version 1.1 1999 Sep 20 by Gordon Paynter ([email protected]) 32 # loosely based on the original HTMLPlug code 28 29 # EMAILPlug 30 # 31 # by Gordon Paynter ([email protected]) 32 # 33 # Email plug reads email files. These are named with a simple 34 # number (i.e. as they appear in mh_mail folders) or with the 35 # extension .email 33 36 # 34 37 # Document text: 35 # The document text consists of all the text occuring after the first36 # blank line in thisdocument.38 # The document text consists of all the text 39 # after the first blank line in the document. 37 40 # 38 41 # Metadata: 42 # $Headers All the header content 39 43 # $Subject Subject: header 40 44 # $To To: header … … 42 46 # $DateText Date: header 43 47 # $Date Date: header in GSDL format (eg: 19990924) 44 # $OtherHeaders All the other headers 45 # $NewText The unquoted text in this message 48 # 49 # Version history 50 # 51 # 1.2 (2000 Jun 12) Major rewrite. 52 # (The new version of Greenstone breaks some of the metadata.) 53 # 1.1.1 Compensated for two-digit years like "95" 54 # 1.1 (1999 Sep 20) Introduced the various metadata fileds 55 # 1.0 Based on the original HTMLPlug code 46 56 # 47 57 … … 56 66 # EMAILPlug is a sub-class of BasPlug. 57 67 58 sub BEGIN { 68 sub BEGIN { 59 69 @ISA = ('BasPlug'); 60 70 } 61 71 72 use strict; 62 73 63 74 # Create a new EMAILPlug object with which to parse a file. 64 # This is done by creating a new BasPlug and usig bless to75 # Accomplished by creating a new BasPlug and using bless to 65 76 # turn it into an EMAILPlug. 66 77 67 78 sub new { 68 79 my ($class) = @_; 69 $self = new BasPlug ();80 my $self = new BasPlug ("EMAILPlug", @_); 70 81 71 82 return bless $self, $class; 72 83 } 73 84 74 75 # Is the EMAILPlug recursive? No. 76 77 sub is_recursive { 85 sub get_default_process_exp { 78 86 my $self = shift (@_); 79 87 80 return 0; # this is not a recursive plugin 81 } 82 83 84 # 85 # read 86 # 87 # read attempts to read a file and store its contents in a 88 # new document object. 89 # 90 # Returns: number of files processed or undef if can't process 91 # This plugin only processes one file at a time. 92 # 93 # Note: $base_dir might be "" and $file might include directories, 94 # but that doesn't affect EMAILPlug 95 # 96 97 sub read { 88 return q^\d+(\.email)?$^; 89 } 90 91 # do plugin specific processing of doc_obj 92 sub process { 98 93 my $self = shift (@_); 99 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 100 101 # Make sure file exists and is an email file 102 my $filename = &util::filename_cat($base_dir, $file); 103 return undef unless ($filename =~ /\.email$/i && (-e $filename)); 104 105 print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'}; 106 107 # create a new document object 108 my $doc_obj = new doc ($file, "indexed_doc"); 109 open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n"; 94 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 95 96 # Check that we're dealing with a valid mail file 97 return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/)); 98 99 print STDERR "EMAILPlug: processing $file\n" 100 if $self->{'verbosity'} > 1; 101 110 102 my $cursection = $doc_obj->get_top_section(); 111 103 112 # Metadata fields 113 my $Subject = ""; 114 my $To = ""; 115 my $From = ""; 116 my $DateText = ""; 117 my $Date = ""; 118 my $OtherHeaders = ""; 119 my $NewText = ""; 120 my $text = ""; 121 my $line = ""; 122 123 my $headers_read = 0; 124 125 # Read and process each line in te email file. 126 # Each file consists of a set of header lines, then a blank line, 127 # then the body of the email. 128 while (<FILE>) { 104 # 105 # Parse the document's text and extract metadata 106 # 107 108 # Separate header from body of message 109 my $Headers = $$textref; 110 $Headers =~ s/\n\n.*//s; 111 $$textref = substr $$textref, (length $Headers); 112 113 # Extract basic metadata from header 114 my @headers = ("From", "To", "Subject", "Date"); 115 my $value = ""; 116 my %raw; 117 118 foreach my $name (@headers) { 119 $value = $Headers; 120 $value =~ s/.*$name://s; 121 $value =~ s/\S*:.*//s; 122 $value =~ s/\s*$//; 123 $value =~ s/\s+/ /g; 124 $raw{$name} = $value; 125 } 126 127 # Process Date information 128 if ($raw{"Date"}) { 129 $raw{"DateText"} = $raw{"Date"}; 129 130 130 $line = $_; 131 # Convert the date text to internal date format 132 $value = $raw{"Date"}; 133 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/; 134 if ($year < 100) { $year += 1900; } 135 $raw{"Date"} = &sorttools::format_date($day, $month, $year); 131 136 132 # Remove carriage returns from the line. 133 # We will later replace single cariage returns with <BR> tags 134 # and double carriage returns with <P> tags. 135 $line =~ s/\n/ /g; 136 137 if ($headers_read) { 138 # The headers have been read, so add this line to the body text 139 $text .= "$line\n"; 140 # If the line isn't quoted, add it to the NewText metadata 141 if ($line =~ /^[^>|]/) { 142 $NewText .= "$line\n"; 143 } 144 145 } elsif ($line =~ /^\s*$/) { 146 # An empty line signals the end of the headers. 147 $headers_read = 1; 148 137 } else { 138 # We have not extracted a date 139 $raw{"DateText"} = "Unknown."; 140 $raw{"Date"} = "19000000"; 141 } 142 143 144 # Add extracted metadata to document object 145 foreach my $name (keys %raw) { 146 $value = $raw{$name}; 147 if ($value) { 148 $value = &text_into_html($value); 149 149 } else { 150 # Read a line of header information and add it to the metadata 151 $line .= "\n"; 152 if ($line =~ /^From:/) { 153 $line =~ s/^From:\s*//; 154 $From .= $line; 155 } elsif ($line =~ /^To:/) { 156 $line =~ s/^To:\s*//; 157 $To .= $line; 158 } elsif ($line =~ /^Date:/) { 159 $line =~ s/^Date:\s*//; 160 $DateText .= $line; 161 if ($Date !~ /\d+/) { 162 # Convert the date text to internal date format 163 my ($day, $month, $year) = $line =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d\d)/; 164 $Date = &sorttools::format_date($day, $month, $year); 165 } 166 } elsif ($line =~ /^Subject:/) { 167 $line =~ s/^Subject:\s*//; 168 $Subject .= $line; 169 } else { 170 $OtherHeaders .= $line; 171 } 172 } 150 $value = "No $name field"; 151 } 152 $doc_obj->add_utf8_metadata ($cursection, $name, $value); 173 153 } 174 154 175 # Add Subject metadata 176 $Subject = &text_into_html($Subject); 177 $Subject = "No Subject" unless ($Subject =~ /\w/); 178 $doc_obj->add_metadata ($cursection, "Subject", $Subject); 179 180 # Add Sender 181 $From = &text_into_html($From); 182 $From = "No Sender" unless ($From =~ /\w/); 183 $doc_obj->add_metadata ($cursection, "Creator", $From); 184 185 # Add Recipient 186 $To = &text_into_html($To); 187 $To = "No Recipient" unless ($To =~ /\w/); 188 $doc_obj->add_metadata ($cursection, "To", $To); 189 190 # Add Date Text 191 $DateText =~ &text_into_html($Date); 192 $doc_obj->add_metadata ($cursection, "DateText", $DateText) if ($DateText =~ /\w/); 193 194 # Add Date 195 $Date =~ &text_into_html($Date); 196 $doc_obj->add_metadata ($cursection, "Date", $Date) if ($Date =~ /\w/); 197 198 # Add Other Headers 199 $OtherHeaders = &text_into_html($OtherHeaders); 200 $doc_obj->add_metadata ($cursection, "OtherHeaders", $OtherHeaders) if ($OtherHeaders =~ /\w/); 201 202 # Add New Text 203 $NewText = &text_into_html($NewText); 204 $doc_obj->add_metadata ($cursection, "NewText", $NewText) if ($NewText =~ /\w/); 205 206 # Add text 207 $text =~ s/<BR>\s*<BR>/<P>/g; 208 $text = &text_into_html($text); 209 $doc_obj->add_text ($cursection, $text) if ($text =~ /\w/); 210 211 # Add the OID - that is, the big HASH value used as a unique ID 212 $doc_obj->set_OID (); 213 214 # Process the document 215 $processor->process($doc_obj); 216 217 return 1; # processed the file 218 } 219 220 221 1; 222 223 224 225 # 155 # Add "All headers" metadata 156 $Headers = &text_into_html($Headers); 157 $Headers = "No headers" unless ($Headers =~ /\w/); 158 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers); 159 160 # Add text to document object 161 $$textref = &text_into_html($$textref); 162 $$textref = "No message" unless ($$textref =~ /\w/); 163 $doc_obj->add_utf8_text($cursection, $$textref); 164 165 return 1; 166 } 167 168 226 169 # Convert a text string into HTML. 227 170 # … … 234 177 # and replaces carriage returns with <BR> tags (and multiple carriage 235 178 # returns with <P> tags). 236 # 179 237 180 238 181 sub text_into_html { 239 182 my ($text) = @_; 240 183 241 242 # Convert problem charaters into HTML symbols 243 $text =~ s/&/&/g; 244 $text =~ s/</</g; 245 $text =~ s/>/>/g; 246 $text =~ s/\"/"/g; 184 # Convert problem characters into HTML symbols 185 $text =~ s/&/&/go; 186 $text =~ s/</</go; 187 $text =~ s/>/>/go; 188 $text =~ s/\"/"/go; 247 189 248 190 # convert email addresses and URLs into links 249 191 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g; 250 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\- ]*)/<a href=\"$1">$1<\/a>/g;192 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g; 251 193 252 194 # Clean up whitespace and convert \n charaters to <BR> or <P> 253 $text =~ s/ +/ /g ;254 $text =~ s/\s*$// ;255 $text =~ s/^\s*// ;256 $text =~ s/\n/\n<BR>/g ;257 $text =~ s/<BR>\s*<BR>/<P>/g ;195 $text =~ s/ +/ /go; 196 $text =~ s/\s*$//o; 197 $text =~ s/^\s*//o; 198 $text =~ s/\n/\n<BR>/go; 199 $text =~ s/<BR>\s*<BR>/<P>/go; 258 200 259 201 return $text; 260 202 } 261 203 262 263 264 265 266 267 268 204 205 # Perl packages have to return true if they are run. 206 1;
Note:
See TracChangeset
for help on using the changeset viewer.