Ignore:
Timestamp:
2000-07-13T10:21:53+12:00 (24 years ago)
Author:
sjboddie
Message:

merged changes to trunk into New_Config_Format branch

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/New_Config_Format-branch/gsdl/perllib/plugins/EMAILPlug.pm

    r638 r1279  
    2626
    2727
    28 #
    29 # EMAILPlug reads an email file (*.email)
    30 #
    31 # Version 1.1   1999 Sep 20  by Gordon Paynter ([email protected])
    32 #                            loosely based on the original HTMLPlug code
     28
     29# EMAILPlug
     30#
     31# by Gordon Paynter ([email protected])
     32#
     33# Email plug reads email files.  These are named with a simple
     34# number (i.e. as they appear in mh_mail folders) or with the
     35# extension .email
    3336#
    3437# Document text:
    35 #   The document text consists of all the text occuring after the first
    36 #   blank line in this document.
     38#   The document text consists of all the text
     39#   after the first blank line in the document.
    3740#
    3841# Metadata:
     42#   $Headers      All the header content
    3943#   $Subject      Subject: header
    4044#   $To           To: header
     
    4246#   $DateText     Date: header
    4347#   $Date         Date: header in GSDL format (eg: 19990924)
    44 #   $OtherHeaders All the other headers
    45 #   $NewText      The unquoted text in this message
     48#
     49# Version history
     50#
     51# 1.2   (2000 Jun 12) Major rewrite.
     52#       (The new version of Greenstone breaks some of the metadata.)
     53# 1.1.1 Compensated for two-digit years like "95"
     54# 1.1   (1999 Sep 20) Introduced the various metadata fileds
     55# 1.0   Based on the original HTMLPlug code
    4656#
    4757
     
    5666# EMAILPlug is a sub-class of BasPlug.
    5767
    58 sub BEGIN {
     68sub BEGIN { 
    5969    @ISA = ('BasPlug');
    6070}
    6171
     72use strict;
    6273
    6374# Create a new EMAILPlug object with which to parse a file.
    64 # This is done by creating a new BasPlug and usig bless to
     75# Accomplished by creating a new BasPlug and using bless to
    6576# turn it into an EMAILPlug.
    6677
    6778sub new {
    6879    my ($class) = @_;
    69     $self = new BasPlug ();
     80    my $self = new BasPlug ("EMAILPlug", @_);
    7081
    7182    return bless $self, $class;
    7283}
    7384
    74 
    75 # Is the EMAILPlug recursive?  No.
    76 
    77 sub is_recursive {
     85sub get_default_process_exp {
    7886    my $self = shift (@_);
    7987
    80     return 0; # this is not a recursive plugin
    81 }
    82 
    83 
    84 #
    85 # read
    86 #
    87 # read attempts to read a file and store its contents in a
    88 # new document object.
    89 #
    90 # Returns: number of files processed or undef if can't process
    91 # This plugin only processes one file at a time.
    92 #
    93 # Note: $base_dir might be "" and $file might include directories,
    94 # but that doesn't affect EMAILPlug
    95 #
    96 
    97 sub read {
     88    return q^\d+(\.email)?$^;
     89}
     90
     91# do plugin specific processing of doc_obj
     92sub process {
    9893    my $self = shift (@_);
    99     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    100 
    101     # Make sure file exists and is an email file
    102     my $filename = &util::filename_cat($base_dir, $file);
    103     return undef unless ($filename =~ /\.email$/i && (-e $filename));
    104 
    105     print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'};
    106 
    107     # create a new document object
    108     my $doc_obj = new doc ($file, "indexed_doc");
    109     open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";
     94    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     95   
     96    # Check that we're dealing with a valid mail file
     97    return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/));
     98
     99    print STDERR "EMAILPlug: processing $file\n"
     100    if $self->{'verbosity'} > 1;
     101
    110102    my $cursection = $doc_obj->get_top_section();
    111103
    112     # Metadata fields
    113     my $Subject = "";
    114     my $To = "";
    115     my $From = "";
    116     my $DateText = "";
    117     my $Date = "";
    118     my $OtherHeaders = "";
    119     my $NewText = "";
    120     my $text = "";
    121     my $line = "";
    122 
    123     my $headers_read = 0;
    124    
    125     # Read and process each line in te email file.
    126     # Each file consists of a set of header lines, then a blank line,
    127     # then the body of the email.
    128     while (<FILE>) {
     104    #
     105    # Parse the document's text and extract metadata
     106    #
     107
     108    # Separate header from body of message
     109    my $Headers = $$textref;
     110    $Headers =~ s/\n\n.*//s;
     111    $$textref = substr $$textref, (length $Headers);
     112
     113    # Extract basic metadata from header
     114    my @headers = ("From", "To", "Subject", "Date");
     115    my $value = "";
     116    my %raw;
     117
     118    foreach my $name (@headers) {
     119    $value = $Headers;
     120    $value =~ s/.*$name://s;
     121    $value =~ s/\S*:.*//s;
     122    $value =~ s/\s*$//;
     123    $value =~ s/\s+/ /g;
     124    $raw{$name} = $value;
     125    }
     126
     127    # Process Date information
     128    if ($raw{"Date"}) {
     129    $raw{"DateText"} = $raw{"Date"};
    129130   
    130     $line = $_;
     131    # Convert the date text to internal date format
     132    $value = $raw{"Date"};
     133    my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
     134    if ($year < 100) { $year += 1900; }
     135    $raw{"Date"} = &sorttools::format_date($day, $month, $year);
    131136   
    132     # Remove carriage returns from the line.
    133     # We will later replace single cariage returns with <BR> tags
    134     # and double carriage returns with <P> tags.
    135     $line =~ s/\n/ /g;
    136 
    137         if ($headers_read) {
    138         # The headers have been read, so add this line to the body text
    139         $text .= "$line\n";
    140             # If the line isn't quoted, add it to the NewText metadata
    141         if ($line =~ /^[^>|]/) {
    142         $NewText .= "$line\n";
    143         }
    144        
    145     } elsif ($line =~ /^\s*$/) {
    146         # An empty line signals the end of the headers.
    147         $headers_read = 1;
    148 
     137    } else {
     138    # We have not extracted a date
     139    $raw{"DateText"} = "Unknown.";
     140    $raw{"Date"} = "19000000";
     141    }
     142
     143
     144    # Add extracted metadata to document object
     145    foreach my $name (keys %raw) {
     146    $value = $raw{$name};
     147    if ($value) {
     148        $value = &text_into_html($value);
    149149    } else {
    150         # Read a line of header information and add it to the metadata
    151         $line .= "\n";
    152         if ($line =~ /^From:/) {
    153         $line =~ s/^From:\s*//;
    154         $From .= $line;
    155         } elsif ($line =~ /^To:/) {
    156         $line =~ s/^To:\s*//;
    157         $To .= $line;
    158         } elsif ($line =~ /^Date:/) {
    159         $line =~ s/^Date:\s*//;
    160         $DateText .= $line;
    161         if ($Date !~ /\d+/) {
    162             # Convert the date text to internal date format
    163             my ($day, $month, $year) = $line =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d\d)/;
    164             $Date = &sorttools::format_date($day, $month, $year);
    165         }
    166         } elsif ($line =~ /^Subject:/) {
    167         $line =~ s/^Subject:\s*//;
    168         $Subject .= $line;
    169         } else {
    170         $OtherHeaders .= $line;
    171         }
    172     }   
     150        $value = "No $name field";
     151    }
     152    $doc_obj->add_utf8_metadata ($cursection, $name, $value);
    173153    }
    174154
    175     # Add Subject metadata
    176     $Subject = &text_into_html($Subject);
    177     $Subject = "No Subject" unless ($Subject =~ /\w/);
    178     $doc_obj->add_metadata ($cursection, "Subject", $Subject);
    179    
    180     # Add Sender
    181     $From = &text_into_html($From);
    182     $From = "No Sender" unless ($From =~ /\w/);
    183     $doc_obj->add_metadata ($cursection, "Creator", $From);
    184 
    185     # Add Recipient
    186     $To = &text_into_html($To);
    187     $To = "No Recipient" unless ($To =~ /\w/);
    188     $doc_obj->add_metadata ($cursection, "To", $To);
    189 
    190     # Add Date Text
    191     $DateText =~ &text_into_html($Date);
    192     $doc_obj->add_metadata ($cursection, "DateText", $DateText) if ($DateText =~ /\w/);
    193 
    194     # Add Date
    195     $Date =~ &text_into_html($Date);
    196     $doc_obj->add_metadata ($cursection, "Date", $Date) if ($Date =~ /\w/);
    197 
    198     # Add Other Headers
    199     $OtherHeaders = &text_into_html($OtherHeaders);
    200     $doc_obj->add_metadata ($cursection, "OtherHeaders", $OtherHeaders) if ($OtherHeaders =~ /\w/);
    201    
    202     # Add New Text
    203     $NewText = &text_into_html($NewText);
    204     $doc_obj->add_metadata ($cursection, "NewText", $NewText) if ($NewText =~ /\w/);
    205 
    206     # Add text
    207     $text =~ s/<BR>\s*<BR>/<P>/g;
    208     $text = &text_into_html($text);
    209     $doc_obj->add_text ($cursection, $text) if ($text =~ /\w/);
    210    
    211     # Add the OID - that is, the big HASH value used as a unique ID
    212     $doc_obj->set_OID ();
    213 
    214     # Process the document
    215     $processor->process($doc_obj);
    216 
    217     return 1; # processed the file
    218 }
    219 
    220 
    221 1;
    222 
    223 
    224 
    225 #
     155    # Add "All headers" metadata
     156    $Headers = &text_into_html($Headers);
     157    $Headers = "No headers" unless ($Headers =~ /\w/);
     158    $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
     159
     160    # Add text to document object
     161    $$textref = &text_into_html($$textref);
     162    $$textref = "No message" unless ($$textref =~ /\w/);
     163    $doc_obj->add_utf8_text($cursection, $$textref);
     164
     165    return 1;
     166}
     167
     168
    226169# Convert a text string into HTML.
    227170#
     
    234177# and replaces carriage returns with <BR> tags (and multiple carriage
    235178# returns with <P> tags).
    236 #
     179
    237180
    238181sub text_into_html {
    239182    my ($text) = @_;
    240183
    241 
    242     # Convert problem charaters into HTML symbols
    243     $text =~ s/&/&amp;/g;
    244     $text =~ s/</&lt;/g;
    245     $text =~ s/>/&gt;/g;
    246     $text =~ s/\"/&quot;/g;
     184    # Convert problem characters into HTML symbols
     185    $text =~ s/&/&amp;/go;
     186    $text =~ s/</&lt;/go;
     187    $text =~ s/>/&gt;/go;
     188    $text =~ s/\"/&quot;/go;
    247189
    248190    # convert email addresses and URLs into links
    249191    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
    250     $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1">$1<\/a>/g;
     192    $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
    251193
    252194    # Clean up whitespace and convert \n charaters to <BR> or <P>
    253     $text =~ s/ +/ /g;
    254     $text =~ s/\s*$//;
    255     $text =~ s/^\s*//;
    256     $text =~ s/\n/\n<BR>/g;
    257     $text =~ s/<BR>\s*<BR>/<P>/g;
     195    $text =~ s/ +/ /go;
     196    $text =~ s/\s*$//o;
     197    $text =~ s/^\s*//o;
     198    $text =~ s/\n/\n<BR>/go;
     199    $text =~ s/<BR>\s*<BR>/<P>/go;
    258200
    259201    return $text;
    260202}
    261203
    262    
    263 
    264 
    265 
    266 
    267 
    268 
     204
     205# Perl packages have to return true if they are run.
     2061;
Note: See TracChangeset for help on using the changeset viewer.