Changeset 33717

Show
Ignore:
Timestamp:
22.11.2019 10:44:13 (2 weeks ago)
Author:
davidb
Message:

Code tidy up; better error checking on running Java cmd; public/private key for VirusTotal added; sleep added if public key

Location:
gs2-extensions/malware-checker/trunk/perllib/plugins
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/malware-checker/trunk/perllib/plugins/MalwareCheckerConverter.pm

    r33676 r33717  
    179179 
    180180 
    181 sub convert { 
    182     my $self = shift(@_); 
    183     my ($source_file_full_path, $target_file_type) = @_; 
    184  
    185     return 0 unless $malwarechecker_conversion_available; 
    186     # check the filename 
    187     return 0 if ( !-f $source_file_full_path); 
    188  
    189     # Although PDFBoxConverter inherits from AutoLoadConverters and therefore 
    190     # doesn't go through gsConvert.pl, still set the -pdf_tool flag in convert_options 
    191     # in case in future PDFBoxConverter no longer inherits from AutoLoadConverters 
    192     # and ends up going through gsConvert.pl 
    193     $self->{'convert_options'} .= " -pdf_tool pdfbox"; 
    194  
    195     my $img_output_mode = 0; 
    196      
    197     my $convert_to = $self->{'convert_to'}; 
    198     my $paged_txt_output_mode = ($convert_to =~ /(pagedimgtxt|paged_text)/) ? 1 : 0; 
    199      
    200     # the following line is necessary to avoid 'uninitialised variable' error 
    201     # messages concerning the converted_to member variable when PDFPlugin's  
    202     # use_sections option is checked. 
    203     # PDFBox plugin now processes use_sections option, when working with v1.5.0 
    204     # of the PDFBox jar file (which embeds each page in special <div> tags). 
    205     if ($target_file_type eq "html") { 
    206     $self->{'converted_to'} = "HTML"; 
    207     } elsif ($target_file_type eq "jpg" || $target_file_type eq "png") { # || $target_file_type eq "gif" 
    208     # GIF not supported by PDFBox at present, see https://pdfbox.apache.org/1.8/commandline.html#pdftoimage 
    209     $self->{'converted_to'} = $target_file_type;     
    210     $img_output_mode = 1; 
    211     } else { 
    212     $self->{'converted_to'} = "text"; 
    213     } 
    214  
    215     my $outhandle = $self->{'outhandle'}; 
    216     my $verbosity = $self->{'verbosity'}; 
    217  
    218     my $source_file_no_path = &File::Basename::basename($source_file_full_path); 
    219     # Determine the full name and path of the output file 
    220     my $target_file_path; 
    221     if ($self->{'enable_cache'}) { 
    222     $self->init_cache_for_file($source_file_full_path); 
    223     my $cache_dir = $self->{'cached_dir'}; 
    224     my $file_root = $self->{'cached_file_root'}; 
    225     #$file_root .= "_$convert_id" if ($convert_id ne ""); 
    226  
    227     # append the output filetype suffix only for non-image output formats, since for 
    228     # images we can be outputting multiple image files per single PDF input file 
    229     my $target_file = ($img_output_mode || $paged_txt_output_mode) ? "$file_root" : "$file_root.$target_file_type"; 
    230  
    231     $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file); 
     181sub checker { 
     182    my $self = shift(@_); 
     183    my ($doc_obj, $source_file_full_path) = @_; 
     184 
     185    print STDERR "***** @@@@@@ !!!!!!! MalwareCheckerConvert::convert called!\n"; 
     186 
     187 
     188    print STDERR "**** want to check: $source_file_full_path\n"; 
     189 
     190    my $virustotal_apikey = $self->{'virustotal_apikey'}; 
     191   
     192    # Build up command along the lines: 
     193    #   java -cp $ENV{'GEXT_MALWARECHECK'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile 
     194 
     195    my $jar_file = &FileUtils::filenameConcatenate($ENV{'GEXT_MALWARECHECKER'},"lib","java","malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar"); 
     196         
     197    my $checker_cmd = "java -cp $jar_file org.greenstone.virustotal.ScanFile $virustotal_apikey $source_file_full_path"; 
     198     
     199    my $print_info = { 'message_prefix' => "MalwareChecker Conversion", 
     200               'message' => "Checking $source_file_full_path" }; 
     201    # $print_info->{'cache_mode'} = $cache_mode if ($cache_mode ne ""); 
     202 
     203    #my $target_file_path = ".virustotal-resourceid-" + $source_file_full_path; 
     204    my $target_file_path = $source_file_full_path . "-VIRUSTOTAL-RESOURCEID"; 
     205     
     206    my ($regenerated,$result,$had_error)  
     207    = $self->autorun_general_cmd($checker_cmd,$source_file_full_path,$target_file_path,$print_info); 
     208 
     209    if ($had_error) { 
     210    print STDERR "Error: Failed to run cmd: $checker_cmd\n"; 
    232211    } 
    233212    else { 
    234     # this is in gsdl/tmp. get a tmp filename in collection instead??? 
    235     $target_file_path = &util::get_tmp_filename($target_file_type); 
    236  
    237     # for image files, remove the suffix, since we can have many output image files 
    238     # per input PDF (one img for each page of the PDF, for example) 
    239     if($img_output_mode || $paged_txt_output_mode) { 
    240         $target_file_path =~ s/\.[^.]*$//g; 
    241         if(!&FileUtils::directoryExists($target_file_path)) {        
    242         mkdir($target_file_path); 
     213    chomp($result); 
     214    my $top_section = $doc_obj->get_top_section(); 
     215    $doc_obj->add_utf8_metadata($top_section, "VirusTotalResourceID", $result); 
     216    } 
     217 
     218    # Need to regulate how often we make calls to VirusTotal API 
     219    # If public key, then 4 per min 
     220 
     221    if ($self->{'virustotal_keytype'} eq "public") { 
     222### if ($regenerated) { 
     223        my $verbosity = $self->{'verbosity'}; 
     224        if ($verbosity >= 1) { 
     225        my $outhandle = $self->{'outhandle'}; 
     226        print $outhandle "Sleeping for 15 secs, in accordance with public key VirusTotal API Terms and Conditions\n"; 
    243227        } 
    244          
    245         # once the item file for the imgs has been created, need to adjust target_file_path 
    246  
    247         # below, we'll store the dir just created to pbtmp_file_paths, so all imgs and the 
    248         # item file generated in it can be deleted in one go on clean_up 
    249     } 
    250      
    251     push(@{$self->{'pbtmp_file_paths'}}, $target_file_path); 
    252     } 
    253  
    254     # Generate and run the convert command 
    255     my $convert_cmd = ""; 
    256  
    257     # want the filename without extension, because any images  
    258     # are to be generated with the same filename as the PDF 
    259     my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$"); 
    260  
    261     if($img_output_mode || $paged_txt_output_mode) { # converting each page to image and/or text 
    262     my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname); 
    263  
    264     # Our custom class does renaming of the pages (simplified to just numbers) for PagedImagePlugin 
    265     #$convert_cmd = $paged_txt_output_mode ? $self->{'malwarechecker_imgtxt_launch_cmd'} : $self->{'malwarechecker_img_launch_cmd'}; 
    266     $convert_cmd = $self->{'malwarechecker_imgtxt_launch_cmd'}; 
    267     $convert_cmd .= " -imagesOnly" unless($paged_txt_output_mode); # set to images only unless there's text too 
    268     if($img_output_mode) { # whether images-only or images-and-text mode 
    269         $convert_cmd .= " -imageType $target_file_type"; 
    270         $convert_cmd .= " -dpi ". $self->{"dpi"} if defined $self->{"dpi"}; 
    271     } else { # img_output_mode off, so paged txt only and no images 
    272         $convert_cmd .= " -textOnly";        
    273     } 
    274     $convert_cmd .= " -outputPrefix \"$output_prefix\""; 
    275     $convert_cmd .= " \"$source_file_full_path\""; 
    276      
    277     } else { # single stream of text or html 
    278      
    279     if ($target_file_type eq "html") { 
    280         $convert_cmd = $self->{'malwarechecker_html_launch_cmd'}; 
    281         $convert_cmd .= " -html" if ($target_file_type eq "html"); 
    282     } else { 
    283         $convert_cmd = $self->{'malwarechecker_txt_launch_cmd'}; 
    284     } 
    285     $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\""; 
    286     } 
    287  
    288     if ($verbosity>2) { 
    289     &gsprintf($outhandle,"Convert command: $convert_cmd\n"); 
    290     } 
    291  
    292     my $print_info = { 'message_prefix' => "PDFBox Conversion", 
    293                'message' => "Converting $source_file_no_path to: $target_file_type" }; 
    294     # $print_info->{'cache_mode'} = $cache_mode if ($cache_mode ne ""); 
    295  
    296     my ($regenerated,$result,$had_error)  
    297     = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info); 
    298  
    299     if($img_output_mode || $paged_txt_output_mode) { 
    300     # now the images have been generated, generate the "$target_file_path/tailname.item"  
    301     # item file for them, which is also the target_file_path that needs to be returned 
    302     $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type); 
    303     #print STDERR "**** item file: $target_file_path\n"; 
    304     } 
    305     elsif ($self->{'converted_to'} eq "text") { 
    306     # ensure html entities are doubly escaped for pdfbox to text conversion: &amp; -> &amp;amp; 
    307     # conversion to html does it automatically, but conversion to text doesn't 
    308     # and this results in illegal characters in doc.xml 
    309  
    310     my $fulltext = &FileUtils::readUTF8File($target_file_path); 
    311     if(defined $fulltext) { 
    312         #$fulltext = &HTML::Entities::encode($fulltext); # doesn't seem to help 
    313         $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML 
    314         &FileUtils::writeUTF8File($target_file_path, \$fulltext); 
    315     } else { 
    316         print STDERR "PDFBoxConverter::convert(): Unable to read from converted file\n"; 
    317         $had_error = 1; 
    318     } 
    319     } 
    320  
    321     if ($had_error) { 
    322     return (0, $result,$target_file_path); 
    323     } 
    324     return (1, $result,$target_file_path); 
    325 } 
    326  
    327 sub convert_without_result { 
    328     my $self = shift(@_); 
    329  
    330     my $source_file_path = shift(@_); 
    331     my $target_file_type = shift(@_); 
    332     my $convert_options  = shift(@_) || ""; 
    333     my $convert_id       = shift(@_) || ""; 
    334  
    335     return $self->convert($source_file_path,$target_file_type, 
    336               $convert_options,$convert_id,"without_result"); 
     228        sleep(15); 
     229### } 
     230    }        
    337231} 
    338232 
  • gs2-extensions/malware-checker/trunk/perllib/plugins/PDFv3Plugin.pm

    r33676 r33717  
    112112     ]; 
    113113 
     114my $virustotal_keytype_list = 
     115    [ { 'name' => "public", 
     116    'desc' => "{PDFv3Plugin.virustotal_type_list.public}" }, 
     117      { 'name' => "private", 
     118    'desc' => "{PDFv3Plugin.virustotal_type_list.private}" } 
     119    ]; 
     120 
     121       
    114122my $opt_malwarechecker_args = [ { 'name' => "malware_checker", 
    115123                  'desc' => "{MalwareChecker.malware_checker}", 
    116124                  'type' => "flag",                
    117                   'reqd' => "no" } ]; 
    118  
    119  
    120 my $options = { 'name'     => "PDFv2Plugin", 
     125                  'reqd' => "no" }, 
     126                { 'name' => "virustotal_keytype", 
     127                  'desc' => "{PDFv3Plugin.virustotal_keytype}", 
     128                  'type' => "enum", 
     129                  'list' => $virustotal_keytype_list,                  
     130                  'deft' => "public" }, 
     131                { 'name' => "virustotal_apikey", 
     132                  'desc' => "{PDFv3Plugin.virustotal_apikey}", 
     133                  'type' => "string", 
     134                  'deft' => "CHANGEME" } 
     135    ]; 
     136 
     137 
     138my $options = { 'name'     => "PDFv3Plugin", 
    121139        'desc'     => "{PDFv2Plugin.desc}", 
    122140        'abstract' => "no", 
     
    756774    print STDERR "\n\n\n\n\n***** PDFv3Plugin::process()\n"; 
    757775    if ($self->{'malware_checker'}) { 
     776 
    758777    my $filename = &FileUtils::filenameConcatenate($base_dir, $file); 
    759     print STDERR "**** want to check: $filename\n"; 
    760  
    761     # java -cp $ENV{'GEXT_MALWARECHECK'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile 
    762  
    763     my $cmd = "java -cp $ENV{'GEXT_MALWARECHECKER'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile $filename"; 
    764     my $status = system($cmd); 
    765     print "STATUS = $status\n"; 
     778 
     779    $self->MalwareCheckerConverter::checker($doc_obj,$filename); 
    766780    } 
    767781