source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 2096

Last change on this file since 2096 was 2096, checked in by jrm21, 23 years ago

Minor changes to regexs, so that header fields have to be at start of line.
(eg /From:/ -> /From:/m). Also changed file extension so that we process
files named with either a number (maildir) or .mbx (mbox). For now, .email
is also recognised...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata:
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header - this will be stored as Creator
46# $DateText Date: header
47# $Date Date: header in GSDL format (eg: 19990924)
48
49package EMAILPlug;
50
51use SplitPlug;
52
53use sorttools;
54use util;
55
56
57# EMAILPlug is a sub-class of SplitPlug.
58
59sub BEGIN {
60 @ISA = ('SplitPlug');
61}
62
63# Create a new EMAILPlug object with which to parse a file.
64# Accomplished by creating a new BasPlug and using bless to
65# turn it into an EMAILPlug.
66
67sub new {
68 my ($class) = @_;
69 my $self = new BasPlug ("EMAILPlug", @_);
70
71 return bless $self, $class;
72}
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76 # mbx/email for mailbox file format, \d+ for maildir (each message is
77 # in a separate file, with a unique number for filename)
78 return q@[\\/]\d+|\.(mbx|email)$@;
79}
80
81# This plugin splits the mbox mail files at lines starting with From<sp>
82sub get_default_split_exp {
83 return q^\nFrom .*\n^;
84}
85
86
87# do plugin specific processing of doc_obj
88sub process {
89 my $self = shift (@_);
90 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
91 my $outhandle = $self->{'outhandle'};
92
93 # Check that we're dealing with a valid mail file
94 return undef unless (($$textref =~ /^From:/m) || ($$textref =~ /^To:/m));
95
96 # slightly more strict validity check, to prevent us from matching
97 # .so.x files ...
98 return undef unless (($$textref =~ /^From /) ||
99 ($$textref =~ /^[-A-Za-z]{2,100}:/m));
100
101 print $outhandle "EMAILPlug: processing $file\n"
102 if $self->{'verbosity'} > 1;
103
104 my $cursection = $doc_obj->get_top_section();
105
106 #
107 # Parse the document's text and extract metadata
108 #
109
110 # Separate header from body of message
111 my $Headers = $$textref;
112 $Headers =~ s/\n\n.*//s;
113 $$textref = substr $$textref, (length $Headers);
114
115 # Extract basic metadata from header
116 my @headers = ("From", "To", "Subject", "Date");
117 my %raw;
118 foreach my $name (@headers) {
119 $raw{$name} = "No $name value";
120 }
121
122 # Examine each line of the headers
123 my ($line, $name, $value);
124 my @parts;
125 foreach $line (split(/\n/, $Headers)) {
126
127 # Ignore lines with no content or which begin with whitespace
128 next unless ($line =~ /:/);
129 next if ($line =~ /^\s/);
130
131 # Find out what metadata is on this line
132 @parts = split(/:/, $line);
133 $name = shift @parts;
134 next unless $name;
135 next unless ($raw{$name});
136
137 # Find the value of that metadata
138 $value = join(":", @parts);
139 $value =~ s/^\s+//;
140 $value =~ s/\s+$//;
141
142 # Store the metadata
143 $raw{$name} = $value;
144 }
145
146 # Process Date information
147 if ($raw{"Date"} !~ /No Date/) {
148 $raw{"DateText"} = $raw{"Date"};
149
150 # Convert the date text to internal date format
151 $value = $raw{"Date"};
152 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
153 if ($year < 100) { $year += 1900; }
154 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
155
156 } else {
157 # We have not extracted a date
158 $raw{"DateText"} = "Unknown.";
159 $raw{"Date"} = "19000000";
160 }
161
162
163 # Add extracted metadata to document object
164 foreach my $name (keys %raw) {
165 $value = $raw{$name};
166 if ($value) {
167 $value = &text_into_html($value);
168 } else {
169 $value = "No $name field";
170 }
171 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
172 }
173
174 # Add "All headers" metadata
175 $Headers = &text_into_html($Headers);
176 $Headers = "No headers" unless ($Headers =~ /\w/);
177 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
178
179 # Add text to document object
180 $$textref = &text_into_html($$textref);
181 $$textref = "No message" unless ($$textref =~ /\w/);
182 $doc_obj->add_utf8_text($cursection, $$textref);
183
184 return 1;
185}
186
187
188# Convert a text string into HTML.
189#
190# The HTML is going to be inserted into a GML file, so
191# we have to be careful not to use symbols like ">",
192# which ocurs frequently in email messages (and use
193# &gt instead.
194#
195# This function also turns links and email addresses into hyperlinks,
196# and replaces carriage returns with <BR> tags (and multiple carriage
197# returns with <P> tags).
198
199
200sub text_into_html {
201 my ($text) = @_;
202
203 # Convert problem characters into HTML symbols
204 $text =~ s/&/&amp;/go;
205 $text =~ s/</&lt;/go;
206 $text =~ s/>/&gt;/go;
207 $text =~ s/\"/&quot;/go;
208
209 # convert email addresses and URLs into links
210 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
211 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
212
213 # Clean up whitespace and convert \n charaters to <BR> or <P>
214 $text =~ s/ +/ /go;
215 $text =~ s/\s*$//o;
216 $text =~ s/^\s*//o;
217 $text =~ s/\n/\n<BR>/go;
218 $text =~ s/<BR>\s*<BR>/<P>/go;
219
220 return $text;
221}
222
223
224# Perl packages have to return true if they are run.
2251;
Note: See TracBrowser for help on using the repository browser.