source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 638

Last change on this file since 638 was 638, checked in by sjboddie, 25 years ago

Gordon's new email plugin thingy

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28#
29# EMAILPlug reads an email file (*.email)
30#
31# Version 1.1 1999 Sep 20 by Gordon Paynter ([email protected])
32# loosely based on the original HTMLPlug code
33#
34# Document text:
35# The document text consists of all the text occuring after the first
36# blank line in this document.
37#
38# Metadata:
39# $Subject Subject: header
40# $To To: header
41# $From From: header - this will be stored as Creator
42# $DateText Date: header
43# $Date Date: header in GSDL format (eg: 19990924)
44# $OtherHeaders All the other headers
45# $NewText The unquoted text in this message
46#
47
48
49package EMAILPlug;
50
51use BasPlug;
52use sorttools;
53use util;
54
55
56# EMAILPlug is a sub-class of BasPlug.
57
58sub BEGIN {
59 @ISA = ('BasPlug');
60}
61
62
63# Create a new EMAILPlug object with which to parse a file.
64# This is done by creating a new BasPlug and usig bless to
65# turn it into an EMAILPlug.
66
67sub new {
68 my ($class) = @_;
69 $self = new BasPlug ();
70
71 return bless $self, $class;
72}
73
74
75# Is the EMAILPlug recursive? No.
76
77sub is_recursive {
78 my $self = shift (@_);
79
80 return 0; # this is not a recursive plugin
81}
82
83
84#
85# read
86#
87# read attempts to read a file and store its contents in a
88# new document object.
89#
90# Returns: number of files processed or undef if can't process
91# This plugin only processes one file at a time.
92#
93# Note: $base_dir might be "" and $file might include directories,
94# but that doesn't affect EMAILPlug
95#
96
97sub read {
98 my $self = shift (@_);
99 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
100
101 # Make sure file exists and is an email file
102 my $filename = &util::filename_cat($base_dir, $file);
103 return undef unless ($filename =~ /\.email$/i && (-e $filename));
104
105 print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'};
106
107 # create a new document object
108 my $doc_obj = new doc ($file, "indexed_doc");
109 open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";
110 my $cursection = $doc_obj->get_top_section();
111
112 # Metadata fields
113 my $Subject = "";
114 my $To = "";
115 my $From = "";
116 my $DateText = "";
117 my $Date = "";
118 my $OtherHeaders = "";
119 my $NewText = "";
120 my $text = "";
121 my $line = "";
122
123 my $headers_read = 0;
124
125 # Read and process each line in te email file.
126 # Each file consists of a set of header lines, then a blank line,
127 # then the body of the email.
128 while (<FILE>) {
129
130 $line = $_;
131
132 # Remove carriage returns from the line.
133 # We will later replace single cariage returns with <BR> tags
134 # and double carriage returns with <P> tags.
135 $line =~ s/\n/ /g;
136
137 if ($headers_read) {
138 # The headers have been read, so add this line to the body text
139 $text .= "$line\n";
140 # If the line isn't quoted, add it to the NewText metadata
141 if ($line =~ /^[^>|]/) {
142 $NewText .= "$line\n";
143 }
144
145 } elsif ($line =~ /^\s*$/) {
146 # An empty line signals the end of the headers.
147 $headers_read = 1;
148
149 } else {
150 # Read a line of header information and add it to the metadata
151 $line .= "\n";
152 if ($line =~ /^From:/) {
153 $line =~ s/^From:\s*//;
154 $From .= $line;
155 } elsif ($line =~ /^To:/) {
156 $line =~ s/^To:\s*//;
157 $To .= $line;
158 } elsif ($line =~ /^Date:/) {
159 $line =~ s/^Date:\s*//;
160 $DateText .= $line;
161 if ($Date !~ /\d+/) {
162 # Convert the date text to internal date format
163 my ($day, $month, $year) = $line =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d\d)/;
164 $Date = &sorttools::format_date($day, $month, $year);
165 }
166 } elsif ($line =~ /^Subject:/) {
167 $line =~ s/^Subject:\s*//;
168 $Subject .= $line;
169 } else {
170 $OtherHeaders .= $line;
171 }
172 }
173 }
174
175 # Add Subject metadata
176 $Subject = &text_into_html($Subject);
177 $Subject = "No Subject" unless ($Subject =~ /\w/);
178 $doc_obj->add_metadata ($cursection, "Subject", $Subject);
179
180 # Add Sender
181 $From = &text_into_html($From);
182 $From = "No Sender" unless ($From =~ /\w/);
183 $doc_obj->add_metadata ($cursection, "Creator", $From);
184
185 # Add Recipient
186 $To = &text_into_html($To);
187 $To = "No Recipient" unless ($To =~ /\w/);
188 $doc_obj->add_metadata ($cursection, "To", $To);
189
190 # Add Date Text
191 $DateText =~ &text_into_html($Date);
192 $doc_obj->add_metadata ($cursection, "DateText", $DateText) if ($DateText =~ /\w/);
193
194 # Add Date
195 $Date =~ &text_into_html($Date);
196 $doc_obj->add_metadata ($cursection, "Date", $Date) if ($Date =~ /\w/);
197
198 # Add Other Headers
199 $OtherHeaders = &text_into_html($OtherHeaders);
200 $doc_obj->add_metadata ($cursection, "OtherHeaders", $OtherHeaders) if ($OtherHeaders =~ /\w/);
201
202 # Add New Text
203 $NewText = &text_into_html($NewText);
204 $doc_obj->add_metadata ($cursection, "NewText", $NewText) if ($NewText =~ /\w/);
205
206 # Add text
207 $text =~ s/<BR>\s*<BR>/<P>/g;
208 $text = &text_into_html($text);
209 $doc_obj->add_text ($cursection, $text) if ($text =~ /\w/);
210
211 # Add the OID - that is, the big HASH value used as a unique ID
212 $doc_obj->set_OID ();
213
214 # Process the document
215 $processor->process($doc_obj);
216
217 return 1; # processed the file
218}
219
220
2211;
222
223
224
225#
226# Convert a text string into HTML.
227#
228# The HTML is going to be inserted into a GML file, so
229# we have to be careful not to use symbols like ">",
230# which ocurs frequently in email messages (and use
231# &gt instead.
232#
233# This function also turns links and email addresses into hyperlinks,
234# and replaces carriage returns with <BR> tags (and multiple carriage
235# returns with <P> tags).
236#
237
238sub text_into_html {
239 my ($text) = @_;
240
241
242 # Convert problem charaters into HTML symbols
243 $text =~ s/&/&amp;/g;
244 $text =~ s/</&lt;/g;
245 $text =~ s/>/&gt;/g;
246 $text =~ s/\"/&quot;/g;
247
248 # convert email addresses and URLs into links
249 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
250 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
251
252 # Clean up whitespace and convert \n charaters to <BR> or <P>
253 $text =~ s/ +/ /g;
254 $text =~ s/\s*$//;
255 $text =~ s/^\s*//;
256 $text =~ s/\n/\n<BR>/g;
257 $text =~ s/<BR>\s*<BR>/<P>/g;
258
259 return $text;
260}
261
262
263
264
265
266
267
268
Note: See TracBrowser for help on using the repository browser.