Changeset 33717


Ignore:
Timestamp:
2019-11-22T10:44:13+13:00 (4 years ago)
Author:
davidb
Message:

Code tidy up; better error checking on running Java cmd; public/private key for VirusTotal added; sleep added when a public key is used

Location:
gs2-extensions/malware-checker/trunk/perllib/plugins
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/malware-checker/trunk/perllib/plugins/MalwareCheckerConverter.pm

    r33676 r33717  
    179179
    180180
    181 sub convert {
    182     my $self = shift(@_);
    183     my ($source_file_full_path, $target_file_type) = @_;
    184 
    185     return 0 unless $malwarechecker_conversion_available;
    186     # check the filename
    187     return 0 if ( !-f $source_file_full_path);
    188 
    189     # Although PDFBoxConverter inherits from AutoLoadConverters and therefore
    190     # doesn't go through gsConvert.pl, still set the -pdf_tool flag in convert_options
    191     # in case in future PDFBoxConverter no longer inherits from AutoLoadConverters
    192     # and ends up going through gsConvert.pl
    193     $self->{'convert_options'} .= " -pdf_tool pdfbox";
    194 
    195     my $img_output_mode = 0;
    196    
    197     my $convert_to = $self->{'convert_to'};
    198     my $paged_txt_output_mode = ($convert_to =~ /(pagedimgtxt|paged_text)/) ? 1 : 0;
    199    
    200     # the following line is necessary to avoid 'uninitialised variable' error
    201     # messages concerning the converted_to member variable when PDFPlugin's
    202     # use_sections option is checked.
    203     # PDFBox plugin now processes use_sections option, when working with v1.5.0
    204     # of the PDFBox jar file (which embeds each page in special <div> tags).
    205     if ($target_file_type eq "html") {
    206     $self->{'converted_to'} = "HTML";
    207     } elsif ($target_file_type eq "jpg" || $target_file_type eq "png") { # || $target_file_type eq "gif"
    208     # GIF not supported by PDFBox at present, see https://pdfbox.apache.org/1.8/commandline.html#pdftoimage
    209     $self->{'converted_to'} = $target_file_type;   
    210     $img_output_mode = 1;
    211     } else {
    212     $self->{'converted_to'} = "text";
    213     }
    214 
    215     my $outhandle = $self->{'outhandle'};
    216     my $verbosity = $self->{'verbosity'};
    217 
    218     my $source_file_no_path = &File::Basename::basename($source_file_full_path);
    219     # Determine the full name and path of the output file
    220     my $target_file_path;
    221     if ($self->{'enable_cache'}) {
    222     $self->init_cache_for_file($source_file_full_path);
    223     my $cache_dir = $self->{'cached_dir'};
    224     my $file_root = $self->{'cached_file_root'};
    225     #$file_root .= "_$convert_id" if ($convert_id ne "");
    226 
    227     # append the output filetype suffix only for non-image output formats, since for
    228     # images we can be outputting multiple image files per single PDF input file
    229     my $target_file = ($img_output_mode || $paged_txt_output_mode) ? "$file_root" : "$file_root.$target_file_type";
    230 
    231     $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file);
     181sub checker {
     182    my $self = shift(@_);
     183    my ($doc_obj, $source_file_full_path) = @_;
     184
     185    print STDERR "***** @@@@@@ !!!!!!! MalwareCheckerConvert::convert called!\n";
     186
     187
     188    print STDERR "**** want to check: $source_file_full_path\n";
     189
     190    my $virustotal_apikey = $self->{'virustotal_apikey'};
     191 
     192    # Build up command along the lines:
     193    #   java -cp $ENV{'GEXT_MALWARECHECKER'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile
     194
     195    my $jar_file = &FileUtils::filenameConcatenate($ENV{'GEXT_MALWARECHECKER'},"lib","java","malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar");
     196       
     197    my $checker_cmd = "java -cp $jar_file org.greenstone.virustotal.ScanFile $virustotal_apikey $source_file_full_path";
     198   
     199    my $print_info = { 'message_prefix' => "MalwareChecker Conversion",
     200               'message' => "Checking $source_file_full_path" };
     201    # $print_info->{'cache_mode'} = $cache_mode if ($cache_mode ne "");
     202
     203    #my $target_file_path = ".virustotal-resourceid-" + $source_file_full_path;
     204    my $target_file_path = $source_file_full_path . "-VIRUSTOTAL-RESOURCEID";
     205   
     206    my ($regenerated,$result,$had_error)
     207    = $self->autorun_general_cmd($checker_cmd,$source_file_full_path,$target_file_path,$print_info);
     208
     209    if ($had_error) {
     210    print STDERR "Error: Failed to run cmd: $checker_cmd\n";
    232211    }
    233212    else {
    234     # this is in gsdl/tmp. get a tmp filename in collection instead???
    235     $target_file_path = &util::get_tmp_filename($target_file_type);
    236 
    237     # for image files, remove the suffix, since we can have many output image files
    238     # per input PDF (one img for each page of the PDF, for example)
    239     if($img_output_mode || $paged_txt_output_mode) {
    240         $target_file_path =~ s/\.[^.]*$//g;
    241         if(!&FileUtils::directoryExists($target_file_path)) {       
    242         mkdir($target_file_path);
     213    chomp($result);
     214    my $top_section = $doc_obj->get_top_section();
     215    $doc_obj->add_utf8_metadata($top_section, "VirusTotalResourceID", $result);
     216    }
     217
     218    # Need to regulate how often we make calls to VirusTotal API
     219    # If public key, then 4 per min
     220
     221    if ($self->{'virustotal_keytype'} eq "public") {
     222### if ($regenerated) {
     223        my $verbosity = $self->{'verbosity'};
     224        if ($verbosity >= 1) {
     225        my $outhandle = $self->{'outhandle'};
     226        print $outhandle "Sleeping for 15 secs, in accordance with public key VirusTotal API Terms and Conditions\n";
    243227        }
    244        
    245         # once the item file for the imgs has been created, need to adjust target_file_path
    246 
    247         # below, we'll store the dir just created to pbtmp_file_paths, so all imgs and the
    248         # item file generated in it can be deleted in one go on clean_up
    249     }
    250    
    251     push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
    252     }
    253 
    254     # Generate and run the convert command
    255     my $convert_cmd = "";
    256 
    257     # want the filename without extension, because any images
    258     # are to be generated with the same filename as the PDF
    259     my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");
    260 
    261     if($img_output_mode || $paged_txt_output_mode) { # converting each page to image and/or text
    262     my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);
    263 
    264     # Our custom class does renaming of the pages (simplified to just numbers) for PagedImagePlugin
    265     #$convert_cmd = $paged_txt_output_mode ? $self->{'malwarechecker_imgtxt_launch_cmd'} : $self->{'malwarechecker_img_launch_cmd'};
    266     $convert_cmd = $self->{'malwarechecker_imgtxt_launch_cmd'};
    267     $convert_cmd .= " -imagesOnly" unless($paged_txt_output_mode); # set to images only unless there's text too
    268     if($img_output_mode) { # whether images-only or images-and-text mode
    269         $convert_cmd .= " -imageType $target_file_type";
    270         $convert_cmd .= " -dpi ". $self->{"dpi"} if defined $self->{"dpi"};
    271     } else { # img_output_mode off, so paged txt only and no images
    272         $convert_cmd .= " -textOnly";       
    273     }
    274     $convert_cmd .= " -outputPrefix \"$output_prefix\"";
    275     $convert_cmd .= " \"$source_file_full_path\"";
    276    
    277     } else { # single stream of text or html
    278    
    279     if ($target_file_type eq "html") {
    280         $convert_cmd = $self->{'malwarechecker_html_launch_cmd'};
    281         $convert_cmd .= " -html" if ($target_file_type eq "html");
    282     } else {
    283         $convert_cmd = $self->{'malwarechecker_txt_launch_cmd'};
    284     }
    285     $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
    286     }
    287 
    288     if ($verbosity>2) {
    289     &gsprintf($outhandle,"Convert command: $convert_cmd\n");
    290     }
    291 
    292     my $print_info = { 'message_prefix' => "PDFBox Conversion",
    293                'message' => "Converting $source_file_no_path to: $target_file_type" };
    294     # $print_info->{'cache_mode'} = $cache_mode if ($cache_mode ne "");
    295 
    296     my ($regenerated,$result,$had_error)
    297     = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);
    298 
    299     if($img_output_mode || $paged_txt_output_mode) {
    300     # now the images have been generated, generate the "$target_file_path/tailname.item"
    301     # item file for them, which is also the target_file_path that needs to be returned
    302     $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
    303     #print STDERR "**** item file: $target_file_path\n";
    304     }
    305     elsif ($self->{'converted_to'} eq "text") {
    306     # ensure html entities are doubly escaped for pdfbox to text conversion: &amp; -> &amp;amp;
    307     # conversion to html does it automatically, but conversion to text doesn't
    308     # and this results in illegal characters in doc.xml
    309 
    310     my $fulltext = &FileUtils::readUTF8File($target_file_path);
    311     if(defined $fulltext) {
    312         #$fulltext = &HTML::Entities::encode($fulltext); # doesn't seem to help
    313         $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML
    314         &FileUtils::writeUTF8File($target_file_path, \$fulltext);
    315     } else {
    316         print STDERR "PDFBoxConverter::convert(): Unable to read from converted file\n";
    317         $had_error = 1;
    318     }
    319     }
    320 
    321     if ($had_error) {
    322     return (0, $result,$target_file_path);
    323     }
    324     return (1, $result,$target_file_path);
    325 }
    326 
    327 sub convert_without_result {
    328     my $self = shift(@_);
    329 
    330     my $source_file_path = shift(@_);
    331     my $target_file_type = shift(@_);
    332     my $convert_options  = shift(@_) || "";
    333     my $convert_id       = shift(@_) || "";
    334 
    335     return $self->convert($source_file_path,$target_file_type,
    336               $convert_options,$convert_id,"without_result");
     228        sleep(15);
     229### }
     230    }       
    337231}
    338232
  • gs2-extensions/malware-checker/trunk/perllib/plugins/PDFv3Plugin.pm

    r33676 r33717  
    112112     ];
    113113
     114my $virustotal_keytype_list =
     115    [ { 'name' => "public",
     116    'desc' => "{PDFv3Plugin.virustotal_type_list.public}" },
     117      { 'name' => "private",
     118    'desc' => "{PDFv3Plugin.virustotal_type_list.private}" }
     119    ];
     120
     121     
    114122my $opt_malwarechecker_args = [ { 'name' => "malware_checker",
    115123                  'desc' => "{MalwareChecker.malware_checker}",
    116124                  'type' => "flag",               
    117                   'reqd' => "no" } ];
    118 
    119 
    120 my $options = { 'name'     => "PDFv2Plugin",
     125                  'reqd' => "no" },
     126                { 'name' => "virustotal_keytype",
     127                  'desc' => "{PDFv3Plugin.virustotal_keytype}",
     128                  'type' => "enum",
     129                  'list' => $virustotal_keytype_list,                 
     130                  'deft' => "public" },
     131                { 'name' => "virustotal_apikey",
     132                  'desc' => "{PDFv3Plugin.virustotal_apikey}",
     133                  'type' => "string",
     134                  'deft' => "CHANGEME" }
     135    ];
     136
     137
     138my $options = { 'name'     => "PDFv3Plugin",
    121139        'desc'     => "{PDFv2Plugin.desc}",
    122140        'abstract' => "no",
     
    756774    print STDERR "\n\n\n\n\n***** PDFv3Plugin::process()\n";
    757775    if ($self->{'malware_checker'}) {
     776
    758777    my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
    759     print STDERR "**** want to check: $filename\n";
    760 
    761     # java -cp $ENV{'GEXT_MALWARECHECK'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile
    762 
    763     my $cmd = "java -cp $ENV{'GEXT_MALWARECHECKER'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile $filename";
    764     my $status = system($cmd);
    765     print "STATUS = $status\n";
     778
     779    $self->MalwareCheckerConverter::checker($doc_obj,$filename);
    766780    }
    767781   
Note: See TracChangeset for help on using the changeset viewer.