Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 638

Last change on this file since 638 was 638, checked in by sjboddie, 25 years ago
Gordon's new email plugin thingy
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 7.2 KB

Line
1	###########################################################################
2	#
3	# EMAILPlug.pm - a plugin for parsing email files
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27
28	#
29	# EMAILPlug reads an email file (*.email)
30	#
31	# Version 1.1 1999 Sep 20 by Gordon Paynter ([email protected])
32	# loosely based on the original HTMLPlug code
33	#
34	# Document text:
35	# The document text consists of all the text occuring after the first
36	# blank line in this document.
37	#
38	# Metadata:
39	# $Subject Subject: header
40	# $To To: header
41	# $From From: header - this will be stored as Creator
42	# $DateText Date: header
43	# $Date Date: header in GSDL format (eg: 19990924)
44	# $OtherHeaders All the other headers
45	# $NewText The unquoted text in this message
46	#
47
48
49	package EMAILPlug;
50
51	use BasPlug;
52	use sorttools;
53	use util;
54
55
56	# EMAILPlug is a sub-class of BasPlug.
57
58	sub BEGIN {
59	@ISA = ('BasPlug');
60	}
61
62
63	# Create a new EMAILPlug object with which to parse a file.
64	# This is done by creating a new BasPlug and usig bless to
65	# turn it into an EMAILPlug.
66
67	sub new {
68	my ($class) = @_;
69	$self = new BasPlug ();
70
71	return bless $self, $class;
72	}
73
74
75	# Is the EMAILPlug recursive? No.
76
77	sub is_recursive {
78	my $self = shift (@_);
79
80	return 0; # this is not a recursive plugin
81	}
82
83
84	#
85	# read
86	#
87	# read attempts to read a file and store its contents in a
88	# new document object.
89	#
90	# Returns: number of files processed or undef if can't process
91	# This plugin only processes one file at a time.
92	#
93	# Note: $base_dir might be "" and $file might include directories,
94	# but that doesn't affect EMAILPlug
95	#
96
97	sub read {
98	my $self = shift (@_);
99	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
100
101	# Make sure file exists and is an email file
102	my $filename = &util::filename_cat($base_dir, $file);
103	return undef unless ($filename =~ /\.email$/i && (-e $filename));
104
105	print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'};
106
107	# create a new document object
108	my $doc_obj = new doc ($file, "indexed_doc");
109	open (FILE, $filename) \|\| die "EMAILPlug::read - can't open $filename\n";
110	my $cursection = $doc_obj->get_top_section();
111
112	# Metadata fields
113	my $Subject = "";
114	my $To = "";
115	my $From = "";
116	my $DateText = "";
117	my $Date = "";
118	my $OtherHeaders = "";
119	my $NewText = "";
120	my $text = "";
121	my $line = "";
122
123	my $headers_read = 0;
124
125	# Read and process each line in te email file.
126	# Each file consists of a set of header lines, then a blank line,
127	# then the body of the email.
128	while (<FILE>) {
129
130	$line = $_;
131
132	# Remove carriage returns from the line.
133	# We will later replace single cariage returns with <BR> tags
134	# and double carriage returns with <P> tags.
135	$line =~ s/\n/ /g;
136
137	if ($headers_read) {
138	# The headers have been read, so add this line to the body text
139	$text .= "$line\n";
140	# If the line isn't quoted, add it to the NewText metadata
141	if ($line =~ /^[^>\|]/) {
142	$NewText .= "$line\n";
143	}
144
145	} elsif ($line =~ /^\s*$/) {
146	# An empty line signals the end of the headers.
147	$headers_read = 1;
148
149	} else {
150	# Read a line of header information and add it to the metadata
151	$line .= "\n";
152	if ($line =~ /^From:/) {
153	$line =~ s/^From:\s*//;
154	$From .= $line;
155	} elsif ($line =~ /^To:/) {
156	$line =~ s/^To:\s*//;
157	$To .= $line;
158	} elsif ($line =~ /^Date:/) {
159	$line =~ s/^Date:\s*//;
160	$DateText .= $line;
161	if ($Date !~ /\d+/) {
162	# Convert the date text to internal date format
163	my ($day, $month, $year) = $line =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d\d)/;
164	$Date = &sorttools::format_date($day, $month, $year);
165	}
166	} elsif ($line =~ /^Subject:/) {
167	$line =~ s/^Subject:\s*//;
168	$Subject .= $line;
169	} else {
170	$OtherHeaders .= $line;
171	}
172	}
173	}
174
175	# Add Subject metadata
176	$Subject = &text_into_html($Subject);
177	$Subject = "No Subject" unless ($Subject =~ /\w/);
178	$doc_obj->add_metadata ($cursection, "Subject", $Subject);
179
180	# Add Sender
181	$From = &text_into_html($From);
182	$From = "No Sender" unless ($From =~ /\w/);
183	$doc_obj->add_metadata ($cursection, "Creator", $From);
184
185	# Add Recipient
186	$To = &text_into_html($To);
187	$To = "No Recipient" unless ($To =~ /\w/);
188	$doc_obj->add_metadata ($cursection, "To", $To);
189
190	# Add Date Text
191	$DateText =~ &text_into_html($Date);
192	$doc_obj->add_metadata ($cursection, "DateText", $DateText) if ($DateText =~ /\w/);
193
194	# Add Date
195	$Date =~ &text_into_html($Date);
196	$doc_obj->add_metadata ($cursection, "Date", $Date) if ($Date =~ /\w/);
197
198	# Add Other Headers
199	$OtherHeaders = &text_into_html($OtherHeaders);
200	$doc_obj->add_metadata ($cursection, "OtherHeaders", $OtherHeaders) if ($OtherHeaders =~ /\w/);
201
202	# Add New Text
203	$NewText = &text_into_html($NewText);
204	$doc_obj->add_metadata ($cursection, "NewText", $NewText) if ($NewText =~ /\w/);
205
206	# Add text
207	$text =~ s/<BR>\s*<BR>/<P>/g;
208	$text = &text_into_html($text);
209	$doc_obj->add_text ($cursection, $text) if ($text =~ /\w/);
210
211	# Add the OID - that is, the big HASH value used as a unique ID
212	$doc_obj->set_OID ();
213
214	# Process the document
215	$processor->process($doc_obj);
216
217	return 1; # processed the file
218	}
219
220
221	1;
222
223
224
225	#
226	# Convert a text string into HTML.
227	#
228	# The HTML is going to be inserted into a GML file, so
229	# we have to be careful not to use symbols like ">",
230	# which ocurs frequently in email messages (and use
231	# &gt instead.
232	#
233	# This function also turns links and email addresses into hyperlinks,
234	# and replaces carriage returns with <BR> tags (and multiple carriage
235	# returns with <P> tags).
236	#
237
238	sub text_into_html {
239	my ($text) = @_;
240
241
242	# Convert problem charaters into HTML symbols
243	$text =~ s/&/&/g;
244	$text =~ s/</</g;
245	$text =~ s/>/>/g;
246	$text =~ s/\"/"/g;
247
248	# convert email addresses and URLs into links
249	$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
250	$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
251
252	# Clean up whitespace and convert \n charaters to <BR> or <P>
253	$text =~ s/ +/ /g;
254	$text =~ s/\s*$//;
255	$text =~ s/^\s*//;
256	$text =~ s/\n/\n<BR>/g;
257	$text =~ s/<BR>\s*<BR>/<P>/g;
258
259	return $text;
260	}
261
262
263
264
265
266
267
268

Note: See TracBrowser for help on using the repository browser.

Download in other formats: