- Timestamp:
- 2017-06-28T20:39:16+12:00 (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm.bak
r31745 r31757 45 45 # At present, a file or folder of files is assumed. 46 46 # Need to look in there for files with extension process_ext. 47 # Support html_multi as output? Then a folder of html files is generated per document? OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version. 47 48 48 49 sub BEGIN { … … 78 79 'desc' => "{UnknownConverterPlugin.output_file_or_dir_name}", 79 80 'type' => "string", 80 'reqd' => " yes",81 'reqd' => "no", 81 82 'deft' => "" } ]; 82 83 … … 96 97 push(@{$hashArgOptLists->{"OptList"}},$options); 97 98 99 my $unknown_converter_self = new UnknownPlugin($pluginlist, $inputargs, $hashArgOptLists); 98 100 my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 99 my $unknown_converter_self = new UnknownPlugin($pluginlist, $inputargs, $hashArgOptLists); 100 my $self = BaseImporter::merge_inheritance($cbf_self, $unknown_converter_self); 101 102 # Need to feed the superclass plugins to merge_inheritance() below in the order that the 103 # superclass plugins were declared in the ISA listing earlier in this file: 104 my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self); 101 105 102 106 $self = bless $self, $class; 103 107 104 $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set at start of this file??????? 108 my $outhandle = $self->{'outhandle'}; 109 print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n"; 110 if(!defined $self->{'convert_to'}) { 111 $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set at the start of this file??????? 112 } 105 113 106 114 # Convert_To set up, including secondary_plugins for processing the text or html generated … … 173 181 my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter 174 182 183 #### COPIED FROM ConvertBinaryFile::tmp_area_convert_file() 184 my $outhandle = $self->{'outhandle'}; 185 my $convert_to = $self->{'convert_to'}; 186 my $failhandle = $self->{'failhandle'}; 187 my $convert_to_ext = $self->{'convert_to_ext'}; #set by ConvertBinaryFile::set_standard_convert_settings() 188 189 190 my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename); 191 192 # derive tmp filename from input filename 193 my ($tailname, $dirname, $suffix) 194 = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$"); 195 196 # softlink to collection tmp dir 197 my $tmp_dirname = &util::get_timestamped_tmp_folder(); 198 if (defined $tmp_dirname) { 199 $self->{'tmp_dir'} = $tmp_dirname; 200 } else { 201 $tmp_dirname = $dirname; 202 } 203 204 # # convert to utf-8 otherwise we have problems with the doc.xml file later on 205 # my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname); 206 207 # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file 208 my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname); 209 210 211 # URLEncode this since htmls with images where the html filename is utf8 don't seem 212 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded 213 # files on the filesystem. 214 $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix"); 215 216 my $lc_suffix = lc($suffix); 217 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix"); 218 219 # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix 220 # But we can't softlink to relative paths. Therefore, we need to ensure that 221 # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html 222 my $ensure_path_absolute = 1; # true 223 &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute); 224 my $verbosity = $self->{'verbosity'}; 225 if ($verbosity > 0) { 226 print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n"; 227 } 228 229 my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log"); 230 231 232 my $output_type=$self->{'convert_to'}; 233 234 # store the *actual* output type and return the output filename 235 # it's possible we requested conversion to html, but only to text succeeded 236 #$self->{'convert_to_ext'} = $output_type; 237 if ($output_type =~ /html/i) { 238 $self->{'converted_to'} = "HTML"; 239 } elsif ($output_type =~ /te?xt/i) { 240 $self->{'converted_to'} = "Text"; 241 } elsif ($output_type =~ /item/i || $output_type =~ /^pagedimg/){ 242 $self->{'converted_to'} = "PagedImage"; 243 } 244 245 my $output_filename = $tmp_filename; 246 my $output_dirname; 247 if ($output_type =~ /item/i || $output_type =~ /^pagedimg/) { 248 # running under windows 249 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 250 $output_dirname = $tmp_dirname . "\\$utf8_tailname\\" . $utf8_tailname; 251 } else { 252 $output_dirname = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname; 253 } 254 $output_filename .= ".item"; 255 } else { 256 $output_filename =~ s/$lc_suffix$/.$output_type/; 257 } 258 259 #### END COPIED FROM ConvertBinaryFile::tmp_area_convert_file() 260 261 # Execute the conversion command and get the type of the result, 262 # making sure the converter gives us the appropriate output type 263 175 264 # On Linux: if the program isn't installed, $? tends to come back with 127, in any case neither 0 nor 1. 176 265 # On Windows: echo %ERRORLEVEL% ends up as 9009 if the program is not installed. … … 178 267 # should produce either a text file or output to stdout. 179 268 180 my $outhandle=$self->{'outhandle'};181 182 269 my $cmd = $self->{'exec_cmd'}; 183 270 if(!$cmd) { # empty string for instance 184 print $outhandle "$plugin_name Conversion error: invalid cmd $cmd\n";271 print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n"; 185 272 return ""; 186 273 } 187 274 188 # replace occurrences of '*' placeholder in cmd string with input filename 189 my ($tailname, $dir, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 190 $cmd =~ s/\*/$tailname/g; 191 print STDERR "@@@@ $plugin_name: executing conversion cmd $cmd\n"; 275 # HARDCODING CMD FOR NOW 276 #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html \"/Scratch/ak19/tutorial_sample_files/pdfbox/A9-access-best-practices.pdf\" \"/Scratch/ak19/gs3-svn-15Nov2016/pdf-tmp/1.html\""; 277 278 #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html INPUT_FILE OUTPUT"; 279 280 # replace occurrences of placeholders in cmd string 281 #$cmd =~ s@\"@\\"@g; 282 $cmd =~ s@INPUT_FILE@\"$input_filename\"@g; 283 if(defined $output_dirname) { 284 $cmd =~ s@OUTPUT@\"$output_dirname\"@g; 285 } else { 286 $cmd =~ s@OUTPUT@\"$output_filename\"@g; 287 } 288 289 print STDERR "@@@@ $plugin_name: executing conversion cmd \n|$cmd|\n"; 290 print STDERR " on infile |$input_filename|\n"; 291 print STDERR " to produce expected $output_filename\n"; 192 292 my $status = system($cmd); 193 293 … … 202 302 } 203 303 204 my $output_file_or_dir = $self->{'output_file_or_dir_name'}; 205 if (!-e $output_file_or_dir) { 206 print $outhandle "$plugin_name Conversion error: Output file/dir $output_file_or_dir doesn't exist\n"; 304 # remove symbolic link to original file 305 &FileUtils::removeFiles($tmp_filename); 306 307 308 if(defined $output_dirname && -d $output_dirname) { 309 print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n"; 207 310 return ""; 208 311 } 312 elsif (!-e $output_filename) { 313 print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n"; 314 return ""; 315 } 209 316 210 317 # else, conversion success 211 318 212 319 # if multiple images were generated by running the conversion 213 if ($self->{'convert_to'} eq "pagedimg") { 214 my $item_filename = $self->generate_item_file($output_file_or_dir); 215 return $item_filename; 216 } 217 218 return $output_file_or_dir; 320 if ($self->{'convert_to'} =~ /^pagedimg/) { 321 my $item_filename = $self->generate_item_file($output_filename); #my $item_filename = $self->generate_item_file($output_file_or_dir); 322 323 if (!-e $item_filename) { 324 print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n"; 325 return ""; 326 } 327 $output_filename = $item_filename; 328 } 329 330 $self->{'output_dirname'} = $output_dirname; 331 $self->{'output_filename'} = $output_filename; 332 333 return $output_filename; #$output_file_or_dir; 219 334 } 220 335 … … 230 345 return undef unless $self->can_process_this_file($filename_full_path); 231 346 232 my $output_file_or_dir = $self->{'output_file_or_dir_name'}; 233 my $is_output_dir = (-d $output_file_or_dir) ? 1 : 0; 347 my $is_output_dir = (defined $self->{'output_dirname'}) ? 1 : 0; 234 348 235 349 # we are only doing something special if we have a directory of html files 236 if (!$is_output_dir || $self->{'convert_to'} ne "html") { 350 #if ($is_output_dir || $self->{'convert_to'} ne "html") { 351 if ($self->{'convert_to'} ne "html_multi") { 237 352 return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm 238 353 } … … 320 435 # deleted some commented out code here that exists in PowerPointPlugin 321 436 322 # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created 323 # as we don't know where it was created 324 #$self->clean_up_after_doc_obj_processing();437 # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created? 438 # as we don't know where it was created. No. Now creating in tmp. 439 $self->clean_up_after_doc_obj_processing(); 325 440 326 441 … … 333 448 sub read_into_doc_obj { 334 449 my $self = shift (@_); 335 $self->ConvertBinaryFile:: deinit(@_);450 $self->ConvertBinaryFile::read_into_doc_obj(@_); 336 451 } 337 452
Note:
See TracChangeset
for help on using the changeset viewer.