Context Navigation

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 2096

Last change on this file since 2096 was 2096, checked in by jrm21, 23 years ago
Minor changes to regexs, so that header fields have to be at start of line. (eg /From:/ -> /^{From:/m). Also changed file extension so that we process files named with either a number (maildir) or .mbx (mbox). For now, .email is also recognised...}
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.2 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28
29	# EMAILPlug
30	#
31	# by Gordon Paynter ([email protected])
32	#
33	# Email plug reads email files. These are named with a simple
34	# number (i.e. as they appear in maildir folders) or with the
35	# extension .mbx (for mbox mail file format)
36	#
37	# Document text:
38	# The document text consists of all the text
39	# after the first blank line in the document.
40	#
41	# Metadata:
42	# $Headers All the header content
43	# $Subject Subject: header
44	# $To To: header
45	# $From From: header - this will be stored as Creator
46	# $DateText Date: header
47	# $Date Date: header in GSDL format (eg: 19990924)
48
49	package EMAILPlug;
50
51	use SplitPlug;
52
53	use sorttools;
54	use util;
55
56
57	# EMAILPlug is a sub-class of SplitPlug.
58
59	sub BEGIN {
60	@ISA = ('SplitPlug');
61	}
62
63	# Create a new EMAILPlug object with which to parse a file.
64	# Accomplished by creating a new BasPlug and using bless to
65	# turn it into an EMAILPlug.
66
67	sub new {
68	my ($class) = @_;
69	my $self = new BasPlug ("EMAILPlug", @_);
70
71	return bless $self, $class;
72	}
73
74	sub get_default_process_exp {
75	my $self = shift (@_);
76	# mbx/email for mailbox file format, \d+ for maildir (each message is
77	# in a separate file, with a unique number for filename)
78	return q@[\\/]\d+\|\.(mbx\|email)$@;
79	}
80
81	# This plugin splits the mbox mail files at lines starting with From<sp>
82	sub get_default_split_exp {
83	return q^\nFrom .*\n^;
84	}
85
86
87	# do plugin specific processing of doc_obj
88	sub process {
89	my $self = shift (@_);
90	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
91	my $outhandle = $self->{'outhandle'};
92
93	# Check that we're dealing with a valid mail file
94	return undef unless (($$textref =~ /^From:/m) \|\| ($$textref =~ /^To:/m));
95
96	# slightly more strict validity check, to prevent us from matching
97	# .so.x files ...
98	return undef unless (($$textref =~ /^From /) \|\|
99	($$textref =~ /^[-A-Za-z]{2,100}:/m));
100
101	print $outhandle "EMAILPlug: processing $file\n"
102	if $self->{'verbosity'} > 1;
103
104	my $cursection = $doc_obj->get_top_section();
105
106	#
107	# Parse the document's text and extract metadata
108	#
109
110	# Separate header from body of message
111	my $Headers = $$textref;
112	$Headers =~ s/\n\n.*//s;
113	$$textref = substr $$textref, (length $Headers);
114
115	# Extract basic metadata from header
116	my @headers = ("From", "To", "Subject", "Date");
117	my %raw;
118	foreach my $name (@headers) {
119	$raw{$name} = "No $name value";
120	}
121
122	# Examine each line of the headers
123	my ($line, $name, $value);
124	my @parts;
125	foreach $line (split(/\n/, $Headers)) {
126
127	# Ignore lines with no content or which begin with whitespace
128	next unless ($line =~ /:/);
129	next if ($line =~ /^\s/);
130
131	# Find out what metadata is on this line
132	@parts = split(/:/, $line);
133	$name = shift @parts;
134	next unless $name;
135	next unless ($raw{$name});
136
137	# Find the value of that metadata
138	$value = join(":", @parts);
139	$value =~ s/^\s+//;
140	$value =~ s/\s+$//;
141
142	# Store the metadata
143	$raw{$name} = $value;
144	}
145
146	# Process Date information
147	if ($raw{"Date"} !~ /No Date/) {
148	$raw{"DateText"} = $raw{"Date"};
149
150	# Convert the date text to internal date format
151	$value = $raw{"Date"};
152	my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
153	if ($year < 100) { $year += 1900; }
154	$raw{"Date"} = &sorttools::format_date($day, $month, $year);
155
156	} else {
157	# We have not extracted a date
158	$raw{"DateText"} = "Unknown.";
159	$raw{"Date"} = "19000000";
160	}
161
162
163	# Add extracted metadata to document object
164	foreach my $name (keys %raw) {
165	$value = $raw{$name};
166	if ($value) {
167	$value = &text_into_html($value);
168	} else {
169	$value = "No $name field";
170	}
171	$doc_obj->add_utf8_metadata ($cursection, $name, $value);
172	}
173
174	# Add "All headers" metadata
175	$Headers = &text_into_html($Headers);
176	$Headers = "No headers" unless ($Headers =~ /\w/);
177	$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
178
179	# Add text to document object
180	$$textref = &text_into_html($$textref);
181	$$textref = "No message" unless ($$textref =~ /\w/);
182	$doc_obj->add_utf8_text($cursection, $$textref);
183
184	return 1;
185	}
186
187
188	# Convert a text string into HTML.
189	#
190	# The HTML is going to be inserted into a GML file, so
191	# we have to be careful not to use symbols like ">",
192	# which ocurs frequently in email messages (and use
193	# &gt instead.
194	#
195	# This function also turns links and email addresses into hyperlinks,
196	# and replaces carriage returns with <BR> tags (and multiple carriage
197	# returns with <P> tags).
198
199
200	sub text_into_html {
201	my ($text) = @_;
202
203	# Convert problem characters into HTML symbols
204	$text =~ s/&/&/go;
205	$text =~ s/</</go;
206	$text =~ s/>/>/go;
207	$text =~ s/\"/"/go;
208
209	# convert email addresses and URLs into links
210	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
211	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
212
213	# Clean up whitespace and convert \n charaters to <BR> or <P>
214	$text =~ s/ +/ /go;
215	$text =~ s/\s*$//o;
216	$text =~ s/^\s*//o;
217	$text =~ s/\n/\n<BR>/go;
218	$text =~ s/<BR>\s*<BR>/<P>/go;
219
220	return $text;
221	}
222
223
224	# Perl packages have to return true if they are run.
225	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: