Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1206

Timestamp:

2000-06-13T09:50:15+12:00 (24 years ago)

Author:

gwp

Message:

A thorough rewrite; some of the metadata was flawed in such a way
that the new version of Greenstone was having trouble during the
building process. There are some improvements: simplified metadata,
it is possible to search all the headers at once, multi-line headers
are properly parsed, and messages no longer require a .email extension.

File:

: 1 edited

trunk/gsdl/perllib/plugins/EMAILPlug.pm (modified) (8 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/EMAILPlug.pm

-              r638
+              r1206
+#
+# EMAILPlug reads an email file (*.email)
+#
+# Version 1.1   1999 Sep 20  by Gordon Paynter ([email protected])
+#                            loosely based on the original HTMLPlug code
+# EMAILPlug
+#
+# by Gordon Paynter ([email protected])
+#
+# Email plug reads email files.  These are named with a simple
+# number (i.e. as they appear in mh_mail folders) or with the
+# extension .email
+#
 # Document text:
 #   The document text consists of all the text occurring after the first
 #   blank line in this document.
+#   The document text consists of all the text
+#   after the first blank line in the document.
+#
 # Metadata:
+#   $Headers      All the header content
 #   $Subject      Subject: header
 #   $To           To: header
 …
 #   $DateText     Date: header
 #   $Date         Date: header in GSDL format (eg: 19990924)
+#   $OtherHeaders All the other headers
+#   $NewText      The unquoted text in this message
+#
+# Version history
+#
+# 1.2   (2000 Jun 12) Major rewrite.
+#       (The new version of Greenstone breaks some of the metadata.)
+# 1.1.1 Compensated for two-digit years like "95"
+# 1.1   (1999 Sep 20) Introduced the various metadata fields
+# 1.0   Based on the original HTMLPlug code
+#
 …
 # EMAILPlug is a sub-class of BasPlug.
 sub BEGIN {
+sub BEGIN {
     @ISA = ('BasPlug');
+}
 …
 # Create a new EMAILPlug object with which to parse a file.
 # This is done by creating a new BasPlug and usig bless to
+# Accomplished by creating a new BasPlug and using bless to
 # turn it into an EMAILPlug.
 …
     my ($class) = @_;
     $self = new BasPlug ();
     return bless $self, $class;
+}
 # Is the EMAILPlug recursive?  No.
+# Is EMAILPlug recursive?  No.
 sub is_recursive {
+    my $self = shift (@_);
+    return 0; # this is not a recursive plugin
+}
+#
+# read
+#
+# read attempts to read a file and store its contents in a
+# new document object.
+#
+# Returns: number of files processed or undef if can't process
+# This plugin only processes one file at a time.
+#
+# Note: $base_dir might be "" and $file might include directories,
+# but that doesn't affect EMAILPlug
+#
+    return 0;
+}
+# Read a file and store its contents in a new document object.
+# First, we check to see if it is an email message we're dealing
+# with, then we extract the text and metadata, then we store
+# all this information.
+#
+# Returns: number of files processed or undef if it can't process
+# a file.  This plugin only processes one file at a time.
 sub read {
 …
     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
+    # Make sure file exists and is an email file
+    #
+    # Check that we're dealing with a valid mail file
+    #
+    # Make sure file exists
     my $filename = &util::filename_cat($base_dir, $file);
+    return undef unless ($filename =~ /\.email$/i && (-e $filename));
+    return undef unless (-e $filename);
+    return undef unless ($filename =~ /\d+(\.email)?$/);
+    # Read the text and make sure it is an email message
+    open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";
+    my @text = <FILE>;
+    my $text = join("", @text);
+    return undef unless (($text =~ /From:/) || ($text =~ /To:/));
     print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'};
+    # create a new document object
+    #
+    # Parse the document's text and extract metadata
+    #
+    # Separate header from body of message
+    my $Headers = $text;
+    $Headers =~ s/\n\n.*//s;
+    $text = substr $text, (length $Headers);
+    # Extract basic metadata from header
+    my @headers = ("From", "To", "Subject", "Date");
+    my $value = "";
+    my %raw;
+    foreach my $name (@headers) {
+    $value = $Headers;
+    $value =~ s/.*$name://s;
+    $value =~ s/\S*:.*//s;
+    $value =~ s/\s*$//;
+    $value =~ s/\s+/ /g;
+    $raw{$name} = $value;
+    }
+    # Process Date information
+    if ($raw{"Date"}) {
+    $raw{"DateText"} = $raw{"Date"};
+    # Convert the date text to internal date format
+    $value = $raw{"Date"};
+    my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
+    if ($year < 100) { $year += 1900; }
+    $raw{"Date"} = &sorttools::format_date($day, $month, $year);
+    } else {
+    # We have not extracted a date
+    $raw{"DateText"} = "Unknown.";
+    $raw{"Date"} = "19000000";
+    }
+    #
+    # Create a new document object
+    #
     my $doc_obj = new doc ($file, "indexed_doc");
-    open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";
     my $cursection = $doc_obj->get_top_section();
+    # Metadata fields
+    my $Subject = "";
+    my $To = "";
+    my $From = "";
+    my $DateText = "";
+    my $Date = "";
+    my $OtherHeaders = "";
+    my $NewText = "";
+    my $text = "";
+    my $line = "";
+    my $headers_read = 0;
+    # Read and process each line in the email file.
+    # Each file consists of a set of header lines, then a blank line,
+    # then the body of the email.
+    while (<FILE>) {
+    $line = $_;
+    # Remove carriage returns from the line.
+    # We will later replace single carriage returns with <BR> tags
+    # and double carriage returns with <P> tags.
+    $line =~ s/\n/ /g;
+        if ($headers_read) {
+        # The headers have been read, so add this line to the body text
+        $text .= "$line\n";
+            # If the line isn't quoted, add it to the NewText metadata
+        if ($line =~ /^[^>|]/) {
+        $NewText .= "$line\n";
+        }
+    } elsif ($line =~ /^\s*$/) {
+        # An empty line signals the end of the headers.
+        $headers_read = 1;
+    # Add specialised metadata
+    foreach my $name (keys %raw) {
+    $value = $raw{$name};
+    if ($value) {
+        $value = &text_into_html($value);
     } else {
+        # Read a line of header information and add it to the metadata
+        $line .= "\n";
+        if ($line =~ /^From:/) {
+        $line =~ s/^From:\s*//;
+        $From .= $line;
+        } elsif ($line =~ /^To:/) {
+        $line =~ s/^To:\s*//;
+        $To .= $line;
+        } elsif ($line =~ /^Date:/) {
+        $line =~ s/^Date:\s*//;
+        $DateText .= $line;
+        if ($Date !~ /\d+/) {
+            # Convert the date text to internal date format
+            my ($day, $month, $year) = $line =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d\d)/;
+            $Date = &sorttools::format_date($day, $month, $year);
+        }
+        } elsif ($line =~ /^Subject:/) {
+        $line =~ s/^Subject:\s*//;
+        $Subject .= $line;
+        } else {
+        $OtherHeaders .= $line;
+        }
+    }
+        $value = "No $name field";
+    }
+    $doc_obj->add_metadata ($cursection, $name, $value);
+    }
+    # Add Subject metadata
+    $Subject = &text_into_html($Subject);
+    $Subject = "No Subject" unless ($Subject =~ /\w/);
+    $doc_obj->add_metadata ($cursection, "Subject", $Subject);
+    # Add Sender
+    $From = &text_into_html($From);
+    $From = "No Sender" unless ($From =~ /\w/);
+    $doc_obj->add_metadata ($cursection, "Creator", $From);
+    # Add Recipient
+    $To = &text_into_html($To);
+    $To = "No Recipient" unless ($To =~ /\w/);
+    $doc_obj->add_metadata ($cursection, "To", $To);
+    # Add Date Text
+    $DateText =~ &text_into_html($Date);
+    $doc_obj->add_metadata ($cursection, "DateText", $DateText) if ($DateText =~ /\w/);
+    # Add Date
+    $Date =~ &text_into_html($Date);
+    $doc_obj->add_metadata ($cursection, "Date", $Date) if ($Date =~ /\w/);
+    # Add Other Headers
+    $OtherHeaders = &text_into_html($OtherHeaders);
+    $doc_obj->add_metadata ($cursection, "OtherHeaders", $OtherHeaders) if ($OtherHeaders =~ /\w/);
+    # Add New Text
+    $NewText = &text_into_html($NewText);
+    $doc_obj->add_metadata ($cursection, "NewText", $NewText) if ($NewText =~ /\w/);
+    # Add text
+    $text =~ s/<BR>\s*<BR>/<P>/g;
+    # Add "All headers" metadata
+    $Headers = &text_into_html($Headers);
+    $Headers = "No headers" unless ($Headers =~ /\w/);
+    $doc_obj->add_metadata ($cursection, "Headers", $Headers);
+    # Add document text
     $text = &text_into_html($text);
+    $doc_obj->add_text ($cursection, $text) if ($text =~ /\w/);
+    $text = "No message" unless ($text =~ /\w/);
+    $doc_obj->add_text ($cursection, $text);
     # Add the OID - that is, the big HASH value used as a unique ID
 …
     $processor->process($doc_obj);
+    return 1; # processed the file
+}
+;
+#
+    # Return the number of documents processed
+    return 1;
+}
 # Convert a text string into HTML.
+#
 …
 # and replaces carriage returns with <BR> tags (and multiple carriage
 # returns with <P> tags).
+#
 sub text_into_html {
     my ($text) = @_;
     # Convert problem characters into HTML symbols
     $text =~ s/&/&amp;/g;
     $text =~ s/</&lt;/g;
     $text =~ s/>/&gt;/g;
     $text =~ s/\"/&quot;/g;
+    $text =~ s/&/&amp;/go;
+    $text =~ s/</&lt;/go;
+    $text =~ s/>/&gt;/go;
+    $text =~ s/\"/&quot;/go;
     # convert email addresses and URLs into links
     $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
     $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
+    $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
     # Clean up whitespace and convert \n characters to <BR> or <P>
     $text =~ s/ +/ /g;
     $text =~ s/\s*$//;
     $text =~ s/^\s*//;
     $text =~ s/\n/\n<BR>/g;
     $text =~ s/<BR>\s*<BR>/<P>/g;
+    $text =~ s/ +/ /go;
+    $text =~ s/\s*$//o;
+    $text =~ s/^\s*//o;
+    $text =~ s/\n/\n<BR>/go;
+    $text =~ s/<BR>\s*<BR>/<P>/go;
     return $text;
+}
+# Perl packages have to return true if they are run.
+;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1206

Legend:

trunk/gsdl/perllib/plugins/EMAILPlug.pm

Download in other formats: