Changeset 1206


Ignore:
Timestamp:
2000-06-13T09:50:15+12:00 (24 years ago)
Author:
gwp
Message:

A thorough rewrite; some of the metadata was flawed in such a way
that the new version of Greenstone was having trouble during the
building process. There are some improvements: simplified metadata,
it is possible to search all the headers at once, multi-line headers
are properly parsed, and messages no longer require a .email extension.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r638 r1206  
    2626
    2727
    28 #
    29 # EMAILPlug reads an email file (*.email)
    30 #
    31 # Version 1.1   1999 Sep 20  by Gordon Paynter ([email protected])
    32 #                            loosely based on the original HTMLPlug code
     28
     29# EMAILPlug
     30#
     31# by Gordon Paynter ([email protected])
     32#
     33# Email plug reads email files.  These are named with a simple
     34# number (i.e. as they appear in mh_mail folders) or with the
     35# extension .email
    3336#
    3437# Document text:
    35 #   The document text consists of all the text occuring after the first
    36 #   blank line in this document.
     38#   The document text consists of all the text
     39#   after the first blank line in the document.
    3740#
    3841# Metadata:
     42#   $Headers      All the header content
    3943#   $Subject      Subject: header
    4044#   $To           To: header
     
    4246#   $DateText     Date: header
    4347#   $Date         Date: header in GSDL format (eg: 19990924)
    44 #   $OtherHeaders All the other headers
    45 #   $NewText      The unquoted text in this message
     48#
     49# Version history
     50#
     51# 1.2   (2000 Jun 12) Major rewrite.
     52#       (The new version of Greenstone breaks some of the metadata.)
     53# 1.1.1 Compensated for two-digit years like "95"
     54# 1.1   (1999 Sep 20) Introduced the various metadata fileds
     55# 1.0   Based on the original HTMLPlug code
    4656#
    4757
     
    5666# EMAILPlug is a sub-class of BasPlug.
    5767
    58 sub BEGIN {
     68sub BEGIN { 
    5969    @ISA = ('BasPlug');
    6070}
     
    6272
    6373# Create a new EMAILPlug object with which to parse a file.
    64 # This is done by creating a new BasPlug and usig bless to
     74# Accomplished by creating a new BasPlug and using bless to
    6575# turn it into an EMAILPlug.
    6676
     
    6878    my ($class) = @_;
    6979    $self = new BasPlug ();
    70 
    7180    return bless $self, $class;
    7281}
    7382
    7483
    75 # Is the EMAILPlug recursive?  No.
     84# Is EMAILPlug recursive?  No.
    7685
    7786sub is_recursive {
    78     my $self = shift (@_);
    79 
    80     return 0; # this is not a recursive plugin
    81 }
    82 
    83 
    84 #
    85 # read
    86 #
    87 # read attempts to read a file and store its contents in a
    88 # new document object.
    89 #
    90 # Returns: number of files processed or undef if can't process
    91 # This plugin only processes one file at a time.
    92 #
    93 # Note: $base_dir might be "" and $file might include directories,
    94 # but that doesn't affect EMAILPlug
    95 #
     87    return 0;
     88}
     89
     90
     91# Read a file and store its contents in a new document object.
     92# First, we check to see if it is an email message we're dealing
     93# with, then we extract the text and metadata, then we store
     94# all this information.
     95#
     96# Returns: number of files processed or undef if it can't process
     97# a file.  This plugin only processes one file at a time.
    9698
    9799sub read {
     
    99101    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    100102
    101     # Make sure file exists and is an email file
     103    #
     104    # Check that we're dealig with a valid mail file
     105    #
     106
     107    # Make sure file exists
    102108    my $filename = &util::filename_cat($base_dir, $file);
    103     return undef unless ($filename =~ /\.email$/i && (-e $filename));
     109    return undef unless (-e $filename);
     110    return undef unless ($filename =~ /\d+(\.email)?$/);
     111
     112    # Read the text and make sure it is an email message
     113    open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";
     114    my @text = <FILE>;
     115    my $text = join("", @text);
     116    return undef unless (($text =~ /From:/) || ($text =~ /To:/));
    104117
    105118    print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'};
    106119
    107     # create a new document object
     120    #
     121    # Parse the document's text and extract metadata
     122    #
     123
     124    # Separate header from body of message
     125    my $Headers = $text;
     126    $Headers =~ s/\n\n.*//s;
     127    $text = substr $text, (length $Headers);
     128
     129    # Extract basic metadata from header
     130    my @headers = ("From", "To", "Subject", "Date");
     131    my $value = "";
     132    my %raw;
     133
     134    foreach my $name (@headers) {
     135    $value = $Headers;
     136    $value =~ s/.*$name://s;
     137    $value =~ s/\S*:.*//s;
     138    $value =~ s/\s*$//;
     139    $value =~ s/\s+/ /g;
     140    $raw{$name} = $value;
     141    }
     142
     143    # Process Date information
     144    if ($raw{"Date"}) {
     145    $raw{"DateText"} = $raw{"Date"};
     146   
     147    # Convert the date text to internal date format
     148    $value = $raw{"Date"};
     149    my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
     150    if ($year < 100) { $year += 1900; }
     151    $raw{"Date"} = &sorttools::format_date($day, $month, $year);
     152   
     153    } else {
     154    # We have not extracted a date
     155    $raw{"DateText"} = "Unknown.";
     156    $raw{"Date"} = "19000000";
     157    }
     158
     159
     160    #
     161    # Create a new document object
     162    #
     163
    108164    my $doc_obj = new doc ($file, "indexed_doc");
    109     open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";
    110165    my $cursection = $doc_obj->get_top_section();
    111166
    112     # Metadata fields
    113     my $Subject = "";
    114     my $To = "";
    115     my $From = "";
    116     my $DateText = "";
    117     my $Date = "";
    118     my $OtherHeaders = "";
    119     my $NewText = "";
    120     my $text = "";
    121     my $line = "";
    122 
    123     my $headers_read = 0;
    124    
    125     # Read and process each line in te email file.
    126     # Each file consists of a set of header lines, then a blank line,
    127     # then the body of the email.
    128     while (<FILE>) {
    129    
    130     $line = $_;
    131    
    132     # Remove carriage returns from the line.
    133     # We will later replace single cariage returns with <BR> tags
    134     # and double carriage returns with <P> tags.
    135     $line =~ s/\n/ /g;
    136 
    137         if ($headers_read) {
    138         # The headers have been read, so add this line to the body text
    139         $text .= "$line\n";
    140             # If the line isn't quoted, add it to the NewText metadata
    141         if ($line =~ /^[^>|]/) {
    142         $NewText .= "$line\n";
    143         }
    144        
    145     } elsif ($line =~ /^\s*$/) {
    146         # An empty line signals the end of the headers.
    147         $headers_read = 1;
    148 
     167    # Add specilised metadata
     168    foreach my $name (keys %raw) {
     169    $value = $raw{$name};
     170    if ($value) {
     171        $value = &text_into_html($value);
    149172    } else {
    150         # Read a line of header information and add it to the metadata
    151         $line .= "\n";
    152         if ($line =~ /^From:/) {
    153         $line =~ s/^From:\s*//;
    154         $From .= $line;
    155         } elsif ($line =~ /^To:/) {
    156         $line =~ s/^To:\s*//;
    157         $To .= $line;
    158         } elsif ($line =~ /^Date:/) {
    159         $line =~ s/^Date:\s*//;
    160         $DateText .= $line;
    161         if ($Date !~ /\d+/) {
    162             # Convert the date text to internal date format
    163             my ($day, $month, $year) = $line =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d\d)/;
    164             $Date = &sorttools::format_date($day, $month, $year);
    165         }
    166         } elsif ($line =~ /^Subject:/) {
    167         $line =~ s/^Subject:\s*//;
    168         $Subject .= $line;
    169         } else {
    170         $OtherHeaders .= $line;
    171         }
    172     }   
     173        $value = "No $name field";
     174    }
     175    $doc_obj->add_metadata ($cursection, $name, $value);
    173176    }
    174177
    175     # Add Subject metadata
    176     $Subject = &text_into_html($Subject);
    177     $Subject = "No Subject" unless ($Subject =~ /\w/);
    178     $doc_obj->add_metadata ($cursection, "Subject", $Subject);
    179    
    180     # Add Sender
    181     $From = &text_into_html($From);
    182     $From = "No Sender" unless ($From =~ /\w/);
    183     $doc_obj->add_metadata ($cursection, "Creator", $From);
    184 
    185     # Add Recipient
    186     $To = &text_into_html($To);
    187     $To = "No Recipient" unless ($To =~ /\w/);
    188     $doc_obj->add_metadata ($cursection, "To", $To);
    189 
    190     # Add Date Text
    191     $DateText =~ &text_into_html($Date);
    192     $doc_obj->add_metadata ($cursection, "DateText", $DateText) if ($DateText =~ /\w/);
    193 
    194     # Add Date
    195     $Date =~ &text_into_html($Date);
    196     $doc_obj->add_metadata ($cursection, "Date", $Date) if ($Date =~ /\w/);
    197 
    198     # Add Other Headers
    199     $OtherHeaders = &text_into_html($OtherHeaders);
    200     $doc_obj->add_metadata ($cursection, "OtherHeaders", $OtherHeaders) if ($OtherHeaders =~ /\w/);
    201    
    202     # Add New Text
    203     $NewText = &text_into_html($NewText);
    204     $doc_obj->add_metadata ($cursection, "NewText", $NewText) if ($NewText =~ /\w/);
    205 
    206     # Add text
    207     $text =~ s/<BR>\s*<BR>/<P>/g;
     178    # Add "All headers" metadata
     179    $Headers = &text_into_html($Headers);
     180    $Headers = "No headers" unless ($Headers =~ /\w/);
     181    $doc_obj->add_metadata ($cursection, "Headers", $Headers);
     182
     183    # Add document text
    208184    $text = &text_into_html($text);
    209     $doc_obj->add_text ($cursection, $text) if ($text =~ /\w/);
     185    $text = "No message" unless ($text =~ /\w/);
     186    $doc_obj->add_text ($cursection, $text);
    210187   
    211188    # Add the OID - that is, the big HASH value used as a unique ID
     
    215192    $processor->process($doc_obj);
    216193
    217     return 1; # processed the file
    218 }
    219 
    220 
    221 1;
    222 
    223 
    224 
    225 #
     194    # Return the number of documents processed
     195    return 1;
     196
     197}
     198
     199
    226200# Convert a text string into HTML.
    227201#
     
    234208# and replaces carriage returns with <BR> tags (and multiple carriage
    235209# returns with <P> tags).
    236 #
     210
    237211
    238212sub text_into_html {
    239213    my ($text) = @_;
    240214
    241 
    242215    # Convert problem charaters into HTML symbols
    243     $text =~ s/&/&amp;/g;
    244     $text =~ s/</&lt;/g;
    245     $text =~ s/>/&gt;/g;
    246     $text =~ s/\"/&quot;/g;
     216    $text =~ s/&/&amp;/go;
     217    $text =~ s/</&lt;/go;
     218    $text =~ s/>/&gt;/go;
     219    $text =~ s/\"/&quot;/go;
    247220
    248221    # convert email addresses and URLs into links
    249222    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
    250     $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
     223    $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
    251224
    252225    # Clean up whitespace and convert \n charaters to <BR> or <P>
    253     $text =~ s/ +/ /g;
    254     $text =~ s/\s*$//;
    255     $text =~ s/^\s*//;
    256     $text =~ s/\n/\n<BR>/g;
    257     $text =~ s/<BR>\s*<BR>/<P>/g;
     226    $text =~ s/ +/ /go;
     227    $text =~ s/\s*$//o;
     228    $text =~ s/^\s*//o;
     229    $text =~ s/\n/\n<BR>/go;
     230    $text =~ s/<BR>\s*<BR>/<P>/go;
    258231
    259232    return $text;
    260233}
    261234
    262    
    263 
    264 
    265 
    266 
    267 
    268 
     235
     236# Perl packages have to return true if they are run.
     2371;
     238   
     239
     240
     241
     242
     243
     244
Note: See TracChangeset for help on using the changeset viewer.