Changeset 33717
- Timestamp:
- 2019-11-22T10:44:13+13:00 (4 years ago)
- Location:
- gs2-extensions/malware-checker/trunk/perllib/plugins
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/malware-checker/trunk/perllib/plugins/MalwareCheckerConverter.pm
# Run the VirusTotal ScanFile Java helper (from the jar under
# $GEXT_MALWARECHECKER/lib/java) on a single file and, on success, record
# the command's output as "VirusTotalResourceID" metadata on the top
# section of the given document object.
#
# Parameters:
#   $doc_obj               - document object; get_top_section() and
#                            add_utf8_metadata() are called on it
#   $source_file_full_path - full path of the file to be checked
#
# Reads from $self: 'virustotal_apikey', 'virustotal_keytype',
# 'verbosity' and 'outhandle'.
#
# Returns 1 if the scan command ran without error, 0 otherwise (including
# when the check was skipped because of a missing file, a missing
# GEXT_MALWARECHECKER environment variable or an unset API key).
# Existing callers that ignore the return value are unaffected.
sub checker {
    my $self = shift(@_);
    my ($doc_obj, $source_file_full_path) = @_;

    my $verbosity = $self->{'verbosity'};
    $verbosity = 0 unless defined $verbosity;
    my $outhandle = $self->{'outhandle'};
    $outhandle = \*STDERR unless defined $outhandle;

    # Nothing to scan: bail out before touching the VirusTotal API (and
    # before paying the rate-limiting sleep below).
    if (!defined $source_file_full_path || !-f $source_file_full_path) {
        my $shown_path = defined $source_file_full_path ? $source_file_full_path : "(undefined)";
        print STDERR "Error: MalwareCheckerConverter::checker(): not a file: $shown_path\n";
        return 0;
    }

    if ($verbosity > 2) {
        print $outhandle "MalwareCheckerConverter::checker(): want to check: $source_file_full_path\n";
    }

    my $ext_home = $ENV{'GEXT_MALWARECHECKER'};
    if (!defined $ext_home || $ext_home eq "") {
        print STDERR "Error: MalwareCheckerConverter::checker(): environment variable GEXT_MALWARECHECKER is not set\n";
        return 0;
    }

    # "CHANGEME" is the placeholder default declared for the
    # virustotal_apikey plugin option, so treat it the same as "not set".
    my $virustotal_apikey = $self->{'virustotal_apikey'};
    if (!defined $virustotal_apikey || $virustotal_apikey eq "" || $virustotal_apikey eq "CHANGEME") {
        print STDERR "Error: MalwareCheckerConverter::checker(): virustotal_apikey has not been set; skipping $source_file_full_path\n";
        return 0;
    }

    # Build up command along the lines:
    #   java -cp $ENV{'GEXT_MALWARECHECKER'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile <apikey> <file>
    my $jar_file = &FileUtils::filenameConcatenate($ext_home,"lib","java","malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar");

    # Every argument is double-quoted so that paths containing spaces
    # survive the shell.
    # NOTE(review): double quotes do not protect against embedded quotes,
    # '$' or backticks in a filename -- confirm whether autorun_general_cmd
    # can take an argument list instead of a command string.
    # NOTE(review): the API key is passed on the command line and is
    # therefore visible in the process list; consider letting ScanFile
    # read it from the environment or a file instead.
    my $cmd_prefix    = "java -cp \"$jar_file\" org.greenstone.virustotal.ScanFile";
    my $checker_cmd   = "$cmd_prefix \"$virustotal_apikey\" \"$source_file_full_path\"";
    # Same command with the key redacted, safe to show in log output.
    my $printable_cmd = "$cmd_prefix <apikey> \"$source_file_full_path\"";

    my $print_info = { 'message_prefix' => "MalwareChecker Conversion",
                       'message' => "Checking $source_file_full_path" };

    # Passed to autorun_general_cmd as the target file for this command.
    my $target_file_path = $source_file_full_path . "-VIRUSTOTAL-RESOURCEID";

    my ($regenerated,$result,$had_error)
        = $self->autorun_general_cmd($checker_cmd,$source_file_full_path,$target_file_path,$print_info);

    if ($had_error) {
        print STDERR "Error: Failed to run cmd: $printable_cmd\n";
    }
    else {
        # Strip the trailing line ending (LF or CRLF) from the command output.
        $result = "" unless defined $result;
        $result =~ s/\s+$//;

        if ($result ne "") {
            my $top_section = $doc_obj->get_top_section();
            $doc_obj->add_utf8_metadata($top_section, "VirusTotalResourceID", $result);
        }
        else {
            print STDERR "Warning: MalwareCheckerConverter::checker(): no resource ID returned for $source_file_full_path\n";
        }
    }

    # Need to regulate how often we make calls to VirusTotal API
    # If public key, then 4 per min, i.e. one call every 15 seconds.
    # An undefined key type is treated as "public" (the option's declared
    # default), which errs on the side of respecting the rate limit.
    my $keytype = $self->{'virustotal_keytype'};
    $keytype = "public" unless defined $keytype;

    if ($keytype eq "public") {
        # NOTE(review): the original author left an "if ($regenerated)"
        # guard commented out here; presumably $regenerated says whether
        # the command was actually re-run -- confirm before restoring it
        # to skip the sleep in that case.
        if ($verbosity >= 1) {
            print $outhandle "Sleeping for 15 secs, in accordance with public key VirusTotal API Terms and Conditions\n";
        }
        sleep(15);
    }

    return $had_error ? 0 : 1;
}
gs2-extensions/malware-checker/trunk/perllib/plugins/PDFv3Plugin.pm
r33676 r33717 112 112 ]; 113 113 114 my $virustotal_keytype_list = 115 [ { 'name' => "public", 116 'desc' => "{PDFv3Plugin.virustotal_type_list.public}" }, 117 { 'name' => "private", 118 'desc' => "{PDFv3Plugin.virustotal_type_list.private}" } 119 ]; 120 121 114 122 my $opt_malwarechecker_args = [ { 'name' => "malware_checker", 115 123 'desc' => "{MalwareChecker.malware_checker}", 116 124 'type' => "flag", 117 'reqd' => "no" } ]; 118 119 120 my $options = { 'name' => "PDFv2Plugin", 125 'reqd' => "no" }, 126 { 'name' => "virustotal_keytype", 127 'desc' => "{PDFv3Plugin.virustotal_keytype}", 128 'type' => "enum", 129 'list' => $virustotal_keytype_list, 130 'deft' => "public" }, 131 { 'name' => "virustotal_apikey", 132 'desc' => "{PDFv3Plugin.virustotal_apikey}", 133 'type' => "string", 134 'deft' => "CHANGEME" } 135 ]; 136 137 138 my $options = { 'name' => "PDFv3Plugin", 121 139 'desc' => "{PDFv2Plugin.desc}", 122 140 'abstract' => "no", … … 756 774 print STDERR "\n\n\n\n\n***** PDFv3Plugin::process()\n"; 757 775 if ($self->{'malware_checker'}) { 776 758 777 my $filename = &FileUtils::filenameConcatenate($base_dir, $file); 759 print STDERR "**** want to check: $filename\n"; 760 761 # java -cp $ENV{'GEXT_MALWARECHECK'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile 762 763 my $cmd = "java -cp $ENV{'GEXT_MALWARECHECKER'}/lib/java/malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar org.greenstone.virustotal.ScanFile $filename"; 764 my $status = system($cmd); 765 print "STATUS = $status\n"; 778 779 $self->MalwareCheckerConverter::checker($doc_obj,$filename); 766 780 } 767 781
Note:
See TracChangeset
for help on using the changeset viewer.