Changeset 3630 for trunk/gsdl/perllib


Ignore:
Timestamp:
2002-12-10T17:58:13+13:00 (22 years ago)
Author:
jrm21
Message:

1) Correct typo in print_usage(): process_exp -> split_exp

2) Fixed up John T's option stuff as they had the wrong default values!

3) Now save attachments. They are not indexed, but can be downloaded. These

are extracted to $GSDLHOME/tmp but there is no way to delete them from
there as they aren't linked into archives directory until we're done here!

4) Added option "-no_attachments" if you don't want the attachments to be

downloadable. Also added option documentation... (!!)

5) bugfixes, including:

a) problems with nested message attachments losing content
b) printing too many headers with attached messages
c) printing to STDERR instead of the $outhandle.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r3627 r3630  
    77# University of Waikato, New Zealand.
    88#
    9 # Copyright (C) 1999-2001 New Zealand Digital Library Project
     9# Copyright (C) 1999-2002 New Zealand Digital Library Project
    1010#
    1111# This program is free software; you can redistribute it and/or modify
     
    8383    print STDERR "\n  usage: plugin EMAILPlug [options]\n\n";
    8484    print STDERR "  options:\n";
    85     print STDERR "   -process_exp      A perl regular expression used to split files\n";
     85    print STDERR "   -split_exp        A perl regular expression used to split files\n";
    8686    print STDERR "                     containing many messages into individual documents.\n\n";
     87    print STDERR "   -no_attachments   Do not save message attachments.\n\n";
    8788
    8889}
     
    9394    'type' => "string",
    9495    'reqd' => "no",
    95     'deft' => q^(?i)\.hb$^} ,
    96 {   'name' => "block_exp",
     96    'deft' => q@([\\/]\d+|\.(mbx|email|eml))$@
     97  },
     98  {     'name' => "no_attachments",
     99    'desc' => "Do not save message attachments.",
     100    'type' => "flag",
     101    'reqd' => "no"
     102  },
     103  {     'name' => "block_exp",
    97104    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
    98105    'type' => "string",
    99106    'reqd' => "no",
    100     'deft' => q^\.jpg$^}
     107    'deft' => q^^}
    101108];
    102109
     
    121128    if (!parsargv::parse(\@_,
    122129             q^split_exp/.*/^, \$self->{'split_exp'},
     130             q^no_attachments^, \$self->{'ignore_attachments'},
    123131             "allow_extra_options")) {
    124132    print STDERR "\nIncorrect options passed to $class.";
     
    126134    die "\n";
    127135    }
     136    $self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
     137
    128138    # this might not actually be true at read-time, but after processing
    129139    # it should all be utf8.
     
    351361    # more than one parameter given to Content-type.
    352362    # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
    353     if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*(\;\s*.*)\s*$/mi)
     363    if ($Headers =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;\s*.+)?\s*$@mi)
    354364    {
    355365        $mimetype=$1;
     
    379389    $$textref= $self->text_from_part("$Headers\n$$textref");
    380390    } elsif ($mimetype ne "text/plain") {
     391    $self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
    381392    $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
    382393                         $outhandle);
     
    556567            $part_body=$message_part;
    557568            $part_header="Content-type: text/plain; charset=us-ascii";
    558         } elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
     569        } elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
    559570            $part_body=$1;
    560571        } else {
     
    571582            $part_content_type="message/rfc822";
    572583        }
    573         if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*(.*?)\s*$@mi) {
     584        if ($part_header =~ m@^content\-type:\s*([\w\.\-/]+)\s*\;?\s*(.*?)\s*$@mi) {
    574585            $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
    575586            $part_content_info=$2;
     
    594605            $text.="</strong></p>\n";
    595606        }
    596 
    597607        if ($part_content_type =~ m@text/@)
    598608        {
     
    608618            # This is a forwarded message
    609619            my $message_part_headers=$part_body;
    610             $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
    611620            $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
    612621            my $message_part_body=$1;
     622            $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
    613623
    614624            my $rfc822_formatted_body=""; # put result in here
    615625            if ($message_part_headers =~
    616             /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.*?)\s*$/ims)
     626            /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.*?)\s*$/ims)
    617627            {
    618628            # The message header uses MIME flags
     
    639649            # Add the returned text to the output
    640650            # don't put all the headers...
    641             $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
    642             $text.=text_into_html($message_part_headers);
     651#           $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
     652            my $brief_headers=get_brief_headers($message_part_headers);
     653            $text.=text_into_html($brief_headers);
    643654            $text.="<p>\n";
    644655            $text.=$rfc822_formatted_body;
     656            $text.="</p>\n";
    645657            # end of message/rfc822
    646658        } elsif ($part_content_type =~ /multipart/) {
     
    652664                                $outhandle);
    653665            $text.=$tmptext;
    654         } elsif ($text eq "") {
    655             # we can't do anything with this part, but if it's the first
    656             # part then make sure it is mentioned..
     666        } else {
     667            # this part isn't text/* or another message...
     668            if ($text eq "") {
     669            # this is the first part of a multipart, or only part!
     670            $text="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
     671            # add part info header
     672            $text.="<br>Type: $part_content_type<br>\n";
     673            $text.="Filename: $filename</strong></p>\n";
     674            }
     675
     676            # save attachment by default
     677            if (!$self->{'ignore_attachments'}
     678            && $filename ne "") { # this part has a file...
     679            my $encoding="8bit";
     680            if ($part_header =~
     681                /^content-transfer-encoding:\s*(\w+)/mi ) {
     682                $encoding=$1; $encoding =~ tr/A-Z/a-z/;
     683            }
     684            my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
     685            my $save_filename="$filename";
     686
     687            # make sure we don't clobber files with same name;
     688            # need to keep state between .mbx files
     689            my $assoc_files=$self->{'assoc_filenames'};
     690            if ($assoc_files{$filename}) { # it's been set...
     691                $assoc_files{$filename}++;
     692                $filename =~ m/(.+)\.(\w+)$/;
     693                my ($filestem, $ext)=($1,$2);
     694                $save_filename="${filestem}_"
     695                . $assoc_files{$filename} . ".$ext";
     696            } else { # first file with this name
     697                $assoc_files{$filename}=1;
     698            }
     699            open (SAVE, ">$tmpdir/$save_filename") ||
     700                warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
     701            $part_text=$message_part;
     702            $part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
     703            if ($encoding eq "base64") {
     704                print SAVE base64_decode($part_text);
     705            } elsif ($encoding eq "quoted-printable") {
     706                print SAVE qp_decode($part_text);
     707            } else { # 7bit, 8bit, binary, etc...
     708                print SAVE $part_text;
     709            }
     710            close SAVE;
     711            my $doc_obj=$self->{'doc_obj'};
     712            $doc_obj->associate_file("$tmpdir/$save_filename",
     713                         "$save_filename",
     714                         $part_content_type # mimetype
     715                         );
     716            # clean up tmp area...
     717            # Can't do this as it hasn't been copied/linked yet!!!
     718#           &util::rm("$tmpdir/$save_filename");
     719            print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
    657720           
    658             $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
    659             # add part info header
    660             $text.="<br>Type: $part_content_type<br>\n";
    661             if ($filename ne "") {
    662             $text.="Filename: $filename\n";
    663             }
    664             $text.="</strong></p>\n";
    665         }
     721            # be nice if "download" was a translatable macro :(
     722            $text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
     723            } # end of save attachment
     724        } # end of !text/message part
    666725        } # foreach message part.
    667726    } else {
     
    675734    $text = $1;
    676735
    677     if ($msg_header =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
     736    if ($msg_header =~ /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.+?)\s*$/mi)
    678737    {
    679738        $mimetype=$1;
     
    689748        } else {$msg_text=$text;}
    690749
    691         my $brief_header="";
    692         if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
    693         if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
    694         if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
    695         if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
    696         if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
     750        my $brief_header=text_into_html(get_brief_headers($msg_header));
    697751        $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
    698752        $text.= "<table><tr><td width=\"5%\"> </td>\n";
     
    708762
    709763
    710 
    711 
     764# Return only the "important" headers from a set of message headers
     765sub get_brief_headers {
     766    my $msg_header = shift;
     767    my $brief_header = "";
     768
     769    # Order matters!
     770    if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
     771    if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
     772    if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
     773    if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
     774    if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
     775
     776    return $brief_header;
     777}
    712778
    713779
     
    726792    }
    727793    $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
    728     $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
     794    $part_header =~ /content\-type:\s*([\w\.\-\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
    729795    my $type=$1;
    730796    my $charset=$2;
     
    743809    } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
    744810        # rfc2045 also allows binary, which we ignore (for now).
    745         # maybe this shouldn't go to stderr, but anyway...
    746         print STDERR "EMAILPlug: unknown encoding: $encoding\n";
     811        my $outhandle=$self->{'outhandle'};
     812        print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
    747813        return "";
    748814    }
Note: See TracChangeset for help on using the changeset viewer.