Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 1895

Last change on this file since 1895 was 1895, checked in by jrm21, 23 years ago
Email plug now uses SplitPlug for mbox mail files. Hopefully this won't cause problems with the other file formats (eg maildir) which don't use the magic regexp /^{From / as a mail separator...}
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.0 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in mh_mail folders) or with the
35	# extension .email
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata:
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header - this will be stored as Creator
46	# $DateText Date: header
47	# $Date Date: header in GSDL format (eg: 19990924)
48
49	package EMAILPlug;
50
51	use SplitPlug;
52
53	use sorttools;
54	use util;
55
56
57	# EMAILPlug is a sub-class of SplitPlug.
58
59	sub BEGIN {
60	@ISA = ('SplitPlug');
61	}
62
63	# Create a new EMAILPlug object with which to parse a file.
64	# Accomplished by creating a new BasPlug and using bless to
65	# turn it into an EMAILPlug.
66
67	sub new {
68	my ($class) = @_;
69	my $self = new BasPlug ("EMAILPlug", @_);
70
71	return bless $self, $class;
72	}
73
74	sub get_default_process_exp {
75	my $self = shift (@_);
76
77	return q^\d+(\.email)?$^;
78	}
79
80	# This plugin splits the mbox mail files at lines starting with From<sp>
81	sub get_default_split_exp {
82	return q^From .*\n^;
83	}
84
85
86	# do plugin specific processing of doc_obj
87	sub process {
88	my $self = shift (@_);
89	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
90	my $outhandle = $self->{'outhandle'};
91
92	# Check that we're dealing with a valid mail file
93	return undef unless (($$textref =~ /From:/) \|\| ($$textref =~ /To:/));
94
95	# slightly more strict validity check, to prevent us from matching
96	# .so.x files ...
97	return undef unless (($$textref =~ /^From /) \|\|
98	($$textref =~ /^[-A-Za-z]{2,100}:/));
99
100	print $outhandle "EMAILPlug: processing $file\n"
101	if $self->{'verbosity'} > 1;
102
103	my $cursection = $doc_obj->get_top_section();
104
105	#
106	# Parse the document's text and extract metadata
107	#
108
109	# Separate header from body of message
110	my $Headers = $$textref;
111	$Headers =~ s/\n\n.*//s;
112	$$textref = substr $$textref, (length $Headers);
113
114	# Extract basic metadata from header
115	my @headers = ("From", "To", "Subject", "Date");
116	my %raw;
117	foreach my $name (@headers) {
118	$raw{$name} = "No $name value";
119	}
120
121	# Examine each line of the headers
122	my ($line, $name, $value);
123	my @parts;
124	foreach $line (split(/\n/, $Headers)) {
125
126	# Ignore lines with no content or which begin with whitespace
127	next unless ($line =~ /:/);
128	next if ($line =~ /^\s/);
129
130	# Find out what metadata is on this line
131	@parts = split(/:/, $line);
132	$name = shift @parts;
133	next unless $name;
134	next unless ($raw{$name});
135
136	# Find the value of that metadata
137	$value = join(":", @parts);
138	$value =~ s/^\s+//;
139	$value =~ s/\s+$//;
140
141	# Store the metadata
142	$raw{$name} = $value;
143	}
144
145	# Process Date information
146	if ($raw{"Date"} !~ /No Date/) {
147	$raw{"DateText"} = $raw{"Date"};
148
149	# Convert the date text to internal date format
150	$value = $raw{"Date"};
151	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
152	if ($year < 100) { $year += 1900; }
153	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
154
155	} else {
156	# We have not extracted a date
157	$raw{"DateText"} = "Unknown.";
158	$raw{"Date"} = "19000000";
159	}
160
161
162	# Add extracted metadata to document object
163	foreach my $name (keys %raw) {
164	$value = $raw{$name};
165	if ($value) {
166	$value = &text_into_html($value);
167	} else {
168	$value = "No $name field";
169	}
170	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
171	}
172
173	# Add "All headers" metadata
174	$Headers = &text_into_html($Headers);
175	$Headers = "No headers" unless ($Headers =~ /\w/);
176	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
177
178	# Add text to document object
179	$$textref = &text_into_html($$textref);
180	$$textref = "No message" unless ($$textref =~ /\w/);
181	$doc_obj->add_utf8_text($cursection, $$textref);
182
183	return 1;
184	}
185
186
187	# Convert a text string into HTML.
188	#
189	# The HTML is going to be inserted into a GML file, so
190	# we have to be careful not to use symbols like ">",
191	# which ocurs frequently in email messages (and use
192	# &gt instead.
193	#
194	# This function also turns links and email addresses into hyperlinks,
195	# and replaces carriage returns with <BR> tags (and multiple carriage
196	# returns with <P> tags).
197
198
199	sub text_into_html {
200	my ($text) = @_;
201
202	# Convert problem characters into HTML symbols
203	$text =~ s/&/&/go;
204	$text =~ s/</</go;
205	$text =~ s/>/>/go;
206	$text =~ s/\"/"/go;
207
208	# convert email addresses and URLs into links
209	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
210	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
211
212	# Clean up whitespace and convert \n charaters to <BR> or <P>
213	$text =~ s/ +/ /go;
214	$text =~ s/\s*$//o;
215	$text =~ s/^\s*//o;
216	$text =~ s/\n/\n<BR>/go;
217	$text =~ s/<BR>\s*<BR>/<P>/go;
218
219	return $text;
220	}
221
222
223	# Perl packages have to return true if they are run.
224	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: