Changeset 1206
- Timestamp:
- 2000-06-13T09:50:15+12:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/EMAILPlug.pm
r638 r1206 26 26 27 27 28 # 29 # EMAILPlug reads an email file (*.email) 30 # 31 # Version 1.1 1999 Sep 20 by Gordon Paynter ([email protected]) 32 # loosely based on the original HTMLPlug code 28 29 # EMAILPlug 30 # 31 # by Gordon Paynter ([email protected]) 32 # 33 # Email plug reads email files. These are named with a simple 34 # number (i.e. as they appear in mh_mail folders) or with the 35 # extension .email 33 36 # 34 37 # Document text: 35 # The document text consists of all the text occuring after the first36 # blank line in thisdocument.38 # The document text consists of all the text 39 # after the first blank line in the document. 37 40 # 38 41 # Metadata: 42 # $Headers All the header content 39 43 # $Subject Subject: header 40 44 # $To To: header … … 42 46 # $DateText Date: header 43 47 # $Date Date: header in GSDL format (eg: 19990924) 44 # $OtherHeaders All the other headers 45 # $NewText The unquoted text in this message 48 # 49 # Version history 50 # 51 # 1.2 (2000 Jun 12) Major rewrite. 52 # (The new version of Greenstone breaks some of the metadata.) 53 # 1.1.1 Compensated for two-digit years like "95" 54 # 1.1 (1999 Sep 20) Introduced the various metadata fileds 55 # 1.0 Based on the original HTMLPlug code 46 56 # 47 57 … … 56 66 # EMAILPlug is a sub-class of BasPlug. 57 67 58 sub BEGIN { 68 sub BEGIN { 59 69 @ISA = ('BasPlug'); 60 70 } … … 62 72 63 73 # Create a new EMAILPlug object with which to parse a file. 64 # This is done by creating a new BasPlug and usig bless to74 # Accomplished by creating a new BasPlug and using bless to 65 75 # turn it into an EMAILPlug. 66 76 … … 68 78 my ($class) = @_; 69 79 $self = new BasPlug (); 70 71 80 return bless $self, $class; 72 81 } 73 82 74 83 75 # Is theEMAILPlug recursive? No.84 # Is EMAILPlug recursive? No. 76 85 77 86 sub is_recursive { 78 my $self = shift (@_); 79 80 return 0; # this is not a recursive plugin 81 } 82 83 84 # 85 # read 86 # 87 # read attempts to read a file and store its contents in a 88 # new document object. 89 # 90 # Returns: number of files processed or undef if can't process 91 # This plugin only processes one file at a time. 92 # 93 # Note: $base_dir might be "" and $file might include directories, 94 # but that doesn't affect EMAILPlug 95 # 87 return 0; 88 } 89 90 91 # Read a file and store its contents in a new document object. 92 # First, we check to see if it is an email message we're dealing 93 # with, then we extract the text and metadata, then we store 94 # all this information. 95 # 96 # Returns: number of files processed or undef if it can't process 97 # a file. This plugin only processes one file at a time. 96 98 97 99 sub read { … … 99 101 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 100 102 101 # Make sure file exists and is an email file 103 # 104 # Check that we're dealig with a valid mail file 105 # 106 107 # Make sure file exists 102 108 my $filename = &util::filename_cat($base_dir, $file); 103 return undef unless ($filename =~ /\.email$/i && (-e $filename)); 109 return undef unless (-e $filename); 110 return undef unless ($filename =~ /\d+(\.email)?$/); 111 112 # Read the text and make sure it is an email message 113 open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n"; 114 my @text = <FILE>; 115 my $text = join("", @text); 116 return undef unless (($text =~ /From:/) || ($text =~ /To:/)); 104 117 105 118 print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'}; 106 119 107 # create a new document object 120 # 121 # Parse the document's text and extract metadata 122 # 123 124 # Separate header from body of message 125 my $Headers = $text; 126 $Headers =~ s/\n\n.*//s; 127 $text = substr $text, (length $Headers); 128 129 # Extract basic metadata from header 130 my @headers = ("From", "To", "Subject", "Date"); 131 my $value = ""; 132 my %raw; 133 134 foreach my $name (@headers) { 135 $value = $Headers; 136 $value =~ s/.*$name://s; 137 $value =~ s/\S*:.*//s; 138 $value =~ s/\s*$//; 139 $value =~ s/\s+/ /g; 140 $raw{$name} = $value; 141 } 142 143 # Process Date information 144 if ($raw{"Date"}) { 145 $raw{"DateText"} = $raw{"Date"}; 146 147 # Convert the date text to internal date format 148 $value = $raw{"Date"}; 149 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/; 150 if ($year < 100) { $year += 1900; } 151 $raw{"Date"} = &sorttools::format_date($day, $month, $year); 152 153 } else { 154 # We have not extracted a date 155 $raw{"DateText"} = "Unknown."; 156 $raw{"Date"} = "19000000"; 157 } 158 159 160 # 161 # Create a new document object 162 # 163 108 164 my $doc_obj = new doc ($file, "indexed_doc"); 109 open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";110 165 my $cursection = $doc_obj->get_top_section(); 111 166 112 # Metadata fields 113 my $Subject = ""; 114 my $To = ""; 115 my $From = ""; 116 my $DateText = ""; 117 my $Date = ""; 118 my $OtherHeaders = ""; 119 my $NewText = ""; 120 my $text = ""; 121 my $line = ""; 122 123 my $headers_read = 0; 124 125 # Read and process each line in te email file. 126 # Each file consists of a set of header lines, then a blank line, 127 # then the body of the email. 128 while (<FILE>) { 129 130 $line = $_; 131 132 # Remove carriage returns from the line. 133 # We will later replace single cariage returns with <BR> tags 134 # and double carriage returns with <P> tags. 135 $line =~ s/\n/ /g; 136 137 if ($headers_read) { 138 # The headers have been read, so add this line to the body text 139 $text .= "$line\n"; 140 # If the line isn't quoted, add it to the NewText metadata 141 if ($line =~ /^[^>|]/) { 142 $NewText .= "$line\n"; 143 } 144 145 } elsif ($line =~ /^\s*$/) { 146 # An empty line signals the end of the headers. 147 $headers_read = 1; 148 167 # Add specilised metadata 168 foreach my $name (keys %raw) { 169 $value = $raw{$name}; 170 if ($value) { 171 $value = &text_into_html($value); 149 172 } else { 150 # Read a line of header information and add it to the metadata 151 $line .= "\n"; 152 if ($line =~ /^From:/) { 153 $line =~ s/^From:\s*//; 154 $From .= $line; 155 } elsif ($line =~ /^To:/) { 156 $line =~ s/^To:\s*//; 157 $To .= $line; 158 } elsif ($line =~ /^Date:/) { 159 $line =~ s/^Date:\s*//; 160 $DateText .= $line; 161 if ($Date !~ /\d+/) { 162 # Convert the date text to internal date format 163 my ($day, $month, $year) = $line =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d\d)/; 164 $Date = &sorttools::format_date($day, $month, $year); 165 } 166 } elsif ($line =~ /^Subject:/) { 167 $line =~ s/^Subject:\s*//; 168 $Subject .= $line; 169 } else { 170 $OtherHeaders .= $line; 171 } 172 } 173 $value = "No $name field"; 174 } 175 $doc_obj->add_metadata ($cursection, $name, $value); 173 176 } 174 177 175 # Add Subject metadata 176 $Subject = &text_into_html($Subject); 177 $Subject = "No Subject" unless ($Subject =~ /\w/); 178 $doc_obj->add_metadata ($cursection, "Subject", $Subject); 179 180 # Add Sender 181 $From = &text_into_html($From); 182 $From = "No Sender" unless ($From =~ /\w/); 183 $doc_obj->add_metadata ($cursection, "Creator", $From); 184 185 # Add Recipient 186 $To = &text_into_html($To); 187 $To = "No Recipient" unless ($To =~ /\w/); 188 $doc_obj->add_metadata ($cursection, "To", $To); 189 190 # Add Date Text 191 $DateText =~ &text_into_html($Date); 192 $doc_obj->add_metadata ($cursection, "DateText", $DateText) if ($DateText =~ /\w/); 193 194 # Add Date 195 $Date =~ &text_into_html($Date); 196 $doc_obj->add_metadata ($cursection, "Date", $Date) if ($Date =~ /\w/); 197 198 # Add Other Headers 199 $OtherHeaders = &text_into_html($OtherHeaders); 200 $doc_obj->add_metadata ($cursection, "OtherHeaders", $OtherHeaders) if ($OtherHeaders =~ /\w/); 201 202 # Add New Text 203 $NewText = &text_into_html($NewText); 204 $doc_obj->add_metadata ($cursection, "NewText", $NewText) if ($NewText =~ /\w/); 205 206 # Add text 207 $text =~ s/<BR>\s*<BR>/<P>/g; 178 # Add "All headers" metadata 179 $Headers = &text_into_html($Headers); 180 $Headers = "No headers" unless ($Headers =~ /\w/); 181 $doc_obj->add_metadata ($cursection, "Headers", $Headers); 182 183 # Add document text 208 184 $text = &text_into_html($text); 209 $doc_obj->add_text ($cursection, $text) if ($text =~ /\w/); 185 $text = "No message" unless ($text =~ /\w/); 186 $doc_obj->add_text ($cursection, $text); 210 187 211 188 # Add the OID - that is, the big HASH value used as a unique ID … … 215 192 $processor->process($doc_obj); 216 193 217 return 1; # processed the file 218 } 219 220 221 1; 222 223 224 225 # 194 # Return the number of documents processed 195 return 1; 196 197 } 198 199 226 200 # Convert a text string into HTML. 227 201 # … … 234 208 # and replaces carriage returns with <BR> tags (and multiple carriage 235 209 # returns with <P> tags). 236 # 210 237 211 238 212 sub text_into_html { 239 213 my ($text) = @_; 240 214 241 242 215 # Convert problem charaters into HTML symbols 243 $text =~ s/&/&/g ;244 $text =~ s/</</g ;245 $text =~ s/>/>/g ;246 $text =~ s/\"/"/g ;216 $text =~ s/&/&/go; 217 $text =~ s/</</go; 218 $text =~ s/>/>/go; 219 $text =~ s/\"/"/go; 247 220 248 221 # convert email addresses and URLs into links 249 222 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g; 250 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\- ]*)/<a href=\"$1">$1<\/a>/g;223 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g; 251 224 252 225 # Clean up whitespace and convert \n charaters to <BR> or <P> 253 $text =~ s/ +/ /g ;254 $text =~ s/\s*$// ;255 $text =~ s/^\s*// ;256 $text =~ s/\n/\n<BR>/g ;257 $text =~ s/<BR>\s*<BR>/<P>/g ;226 $text =~ s/ +/ /go; 227 $text =~ s/\s*$//o; 228 $text =~ s/^\s*//o; 229 $text =~ s/\n/\n<BR>/go; 230 $text =~ s/<BR>\s*<BR>/<P>/go; 258 231 259 232 return $text; 260 233 } 261 234 262 263 264 265 266 267 268 235 236 # Perl packages have to return true if they are run. 237 1; 238 239 240 241 242 243 244
Note:
See TracChangeset
for help on using the changeset viewer.