Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 2493

Last change on this file since 2493 was 2493, checked in by paynter, 23 years ago
Changed at the request of Marcio - see mailing list.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.3 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata:
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header - this will be stored as Creator
46	# $DateText Date: header
47	# $Date Date: header in GSDL format (eg: 19990924)
48
49	package EMAILPlug;
50
51	use SplitPlug;
52
53	use sorttools;
54	use util;
55
56
57	# EMAILPlug is a sub-class of SplitPlug.
58
59	sub BEGIN {
60	@ISA = ('SplitPlug');
61	}
62
63	# Create a new EMAILPlug object with which to parse a file.
64	# Accomplished by creating a new BasPlug and using bless to
65	# turn it into an EMAILPlug.
66
67	sub new {
68	my ($class) = @_;
69	my $self = new BasPlug ("EMAILPlug", @_);
70
71	return bless $self, $class;
72	}
73
74	sub get_default_process_exp {
75	my $self = shift (@_);
76	# mbx/email for mailbox file format, \d+ for maildir (each message is
77	# in a separate file, with a unique number for filename)
78	return q@[\\/]\d+\|\.(mbx\|email)$@;
79	}
80
81	# This plugin splits the mbox mail files at lines starting with From<sp>
82	sub get_default_split_exp {
83	return q^\nFrom .*\n^;
84	}
85
86
87	# do plugin specific processing of doc_obj
88	sub process {
89	my $self = shift (@_);
90	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
91	my $outhandle = $self->{'outhandle'};
92
93	# Check that we're dealing with a valid mail file
94	return undef unless (($$textref =~ /^From:/m) \|\| ($$textref =~ /^To:/m));
95
96	# slightly more strict validity check, to prevent us from matching
97	# .so.x files ...
98	return undef unless (($$textref =~ /^From /) \|\|
99	($$textref =~ /^[-A-Za-z]{2,100}:/m));
100
101	print $outhandle "EMAILPlug: processing $file\n"
102	if $self->{'verbosity'} > 1;
103
104	my $cursection = $doc_obj->get_top_section();
105
106	#
107	# Parse the document's text and extract metadata
108	#
109
110	# Separate header from body of message
111	my $Headers = $$textref;
112	#$Headers =~ s/\n\n.*//s; # This line changed at Marcio's request
113	$Headers =~ s/\x0a\x0d?\x0a.*//s;
114	$$textref = substr $$textref, (length $Headers);
115
116
117	# Extract basic metadata from header
118	my @headers = ("From", "To", "Subject", "Date");
119	my %raw;
120	foreach my $name (@headers) {
121	$raw{$name} = "No $name value";
122	}
123
124	# Examine each line of the headers
125	my ($line, $name, $value);
126	my @parts;
127	foreach $line (split(/\n/, $Headers)) {
128
129	# Ignore lines with no content or which begin with whitespace
130	next unless ($line =~ /:/);
131	next if ($line =~ /^\s/);
132
133	# Find out what metadata is on this line
134	@parts = split(/:/, $line);
135	$name = shift @parts;
136	next unless $name;
137	next unless ($raw{$name});
138
139	# Find the value of that metadata
140	$value = join(":", @parts);
141	$value =~ s/^\s+//;
142	$value =~ s/\s+$//;
143
144	# Store the metadata
145	$raw{$name} = $value;
146	}
147
148	# Process Date information
149	if ($raw{"Date"} !~ /No Date/) {
150	$raw{"DateText"} = $raw{"Date"};
151
152	# Convert the date text to internal date format
153	$value = $raw{"Date"};
154	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
155	if ($year < 100) { $year += 1900; }
156	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
157
158	} else {
159	# We have not extracted a date
160	$raw{"DateText"} = "Unknown.";
161	$raw{"Date"} = "19000000";
162	}
163
164
165	# Add extracted metadata to document object
166	foreach my $name (keys %raw) {
167	$value = $raw{$name};
168	if ($value) {
169	$value = &text_into_html($value);
170	} else {
171	$value = "No $name field";
172	}
173	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
174	}
175
176	# Add "All headers" metadata
177	$Headers = &text_into_html($Headers);
178	$Headers = "No headers" unless ($Headers =~ /\w/);
179	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
180
181	# Add text to document object
182	$$textref = &text_into_html($$textref);
183	$$textref = "No message" unless ($$textref =~ /\w/);
184	$doc_obj->add_utf8_text($cursection, $$textref);
185
186	return 1;
187	}
188
189
190	# Convert a text string into HTML.
191	#
192	# The HTML is going to be inserted into a GML file, so
193	# we have to be careful not to use symbols like ">",
194	# which ocurs frequently in email messages (and use
195	# &gt instead.
196	#
197	# This function also turns links and email addresses into hyperlinks,
198	# and replaces carriage returns with <BR> tags (and multiple carriage
199	# returns with <P> tags).
200
201
202	sub text_into_html {
203	my ($text) = @_;
204
205	# Convert problem characters into HTML symbols
206	$text =~ s/&/&/go;
207	$text =~ s/</</go;
208	$text =~ s/>/>/go;
209	$text =~ s/\"/"/go;
210
211	# convert email addresses and URLs into links
212	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
213	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
214
215	# Clean up whitespace and convert \n charaters to <BR> or <P>
216	$text =~ s/ +/ /go;
217	$text =~ s/\s*$//o;
218	$text =~ s/^\s*//o;
219	$text =~ s/\n/\n<BR>/go;
220	$text =~ s/<BR>\s*<BR>/<P>/go;
221
222	return $text;
223	}
224
225
226	# Perl packages have to return true if they are run.
227	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: