source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 1244

Last change on this file since 1244 was 1244, checked in by sjboddie, 24 years ago

Caught up most general plugins (that's the ones in gsdlhome/perllib/plugins)
with changes to BasPlug so that they can all now use the new general plugin
options. Those I didn't do were FoxPlug (as it's not actually used anywhere
and I don't know what it does) and WebPlug (as it's kind of a work in
progress and doesn't really work anyway). All plugins will still work
(including all the collection specific ones that are laying around), some
of them just won't have access to the general options.
I also wrote a short perl script (pluginfo.pl) that prints out all the
options available to a given plugin.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in mh_mail folders) or with the
35# extension .email
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata:
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header - this will be stored as Creator
46# $DateText Date: header
47# $Date Date: header in GSDL format (eg: 19990924)
48#
49# Version history
50#
51# 1.2 (2000 Jun 12) Major rewrite.
52# (The new version of Greenstone breaks some of the metadata.)
53# 1.1.1 Compensated for two-digit years like "95"
54# 1.1 (1999 Sep 20) Introduced the various metadata fileds
55# 1.0 Based on the original HTMLPlug code
56#
57
58
59package EMAILPlug;
60
61use BasPlug;
62use sorttools;
63use util;
64
65
66# EMAILPlug is a sub-class of BasPlug.
67
68sub BEGIN {
69 @ISA = ('BasPlug');
70}
71
72use strict;
73
74# Create a new EMAILPlug object with which to parse a file.
75# Accomplished by creating a new BasPlug and using bless to
76# turn it into an EMAILPlug.
77
78sub new {
79 my ($class) = @_;
80 my $self = new BasPlug ("EMAILPlug", @_);
81
82 return bless $self, $class;
83}
84
85sub get_default_process_exp {
86 my $self = shift (@_);
87
88 return q^\d+(\.email)?$^;
89}
90
91# do plugin specific processing of doc_obj
92sub process {
93 my $self = shift (@_);
94 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
95
96 # Check that we're dealing with a valid mail file
97 return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/));
98
99 print STDERR "EMAILPlug: processing $file\n"
100 if $self->{'verbosity'} > 1;
101
102 my $cursection = $doc_obj->get_top_section();
103
104 #
105 # Parse the document's text and extract metadata
106 #
107
108 # Separate header from body of message
109 my $Headers = $$textref;
110 $Headers =~ s/\n\n.*//s;
111 $$textref = substr $$textref, (length $Headers);
112
113 # Extract basic metadata from header
114 my @headers = ("From", "To", "Subject", "Date");
115 my $value = "";
116 my %raw;
117
118 foreach my $name (@headers) {
119 $value = $Headers;
120 $value =~ s/.*$name://s;
121 $value =~ s/\S*:.*//s;
122 $value =~ s/\s*$//;
123 $value =~ s/\s+/ /g;
124 $raw{$name} = $value;
125 }
126
127 # Process Date information
128 if ($raw{"Date"}) {
129 $raw{"DateText"} = $raw{"Date"};
130
131 # Convert the date text to internal date format
132 $value = $raw{"Date"};
133 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
134 if ($year < 100) { $year += 1900; }
135 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
136
137 } else {
138 # We have not extracted a date
139 $raw{"DateText"} = "Unknown.";
140 $raw{"Date"} = "19000000";
141 }
142
143
144 # Add extracted metadata to document object
145 foreach my $name (keys %raw) {
146 $value = $raw{$name};
147 if ($value) {
148 $value = &text_into_html($value);
149 } else {
150 $value = "No $name field";
151 }
152 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
153 }
154
155 # Add "All headers" metadata
156 $Headers = &text_into_html($Headers);
157 $Headers = "No headers" unless ($Headers =~ /\w/);
158 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
159
160 # Add text to document object
161 $$textref = &text_into_html($$textref);
162 $$textref = "No message" unless ($$textref =~ /\w/);
163 $doc_obj->add_utf8_text($cursection, $$textref);
164
165 return 1;
166}
167
168
169# Convert a text string into HTML.
170#
171# The HTML is going to be inserted into a GML file, so
172# we have to be careful not to use symbols like ">",
173# which ocurs frequently in email messages (and use
174# &gt instead.
175#
176# This function also turns links and email addresses into hyperlinks,
177# and replaces carriage returns with <BR> tags (and multiple carriage
178# returns with <P> tags).
179
180
181sub text_into_html {
182 my ($text) = @_;
183
184 # Convert problem characters into HTML symbols
185 $text =~ s/&/&amp;/go;
186 $text =~ s/</&lt;/go;
187 $text =~ s/>/&gt;/go;
188 $text =~ s/\"/&quot;/go;
189
190 # convert email addresses and URLs into links
191 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
192 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
193
194 # Clean up whitespace and convert \n charaters to <BR> or <P>
195 $text =~ s/ +/ /go;
196 $text =~ s/\s*$//o;
197 $text =~ s/^\s*//o;
198 $text =~ s/\n/\n<BR>/go;
199 $text =~ s/<BR>\s*<BR>/<P>/go;
200
201 return $text;
202}
203
204
205# Perl packages have to return true if they are run.
2061;
Note: See TracBrowser for help on using the repository browser.