Changeset 31757
- Timestamp:
- 2017-06-28T20:39:16+12:00 (6 years ago)
- Location:
- main/trunk/greenstone2/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm.bak
r31745 r31757 45 45 # At present, a file or folder of files is assumed. 46 46 # Need to look in there for files with extension process_ext. 47 # Support html_multi as output? Then a folder of html files is generated per document? OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version. 47 48 48 49 sub BEGIN { … … 78 79 'desc' => "{UnknownConverterPlugin.output_file_or_dir_name}", 79 80 'type' => "string", 80 'reqd' => " yes",81 'reqd' => "no", 81 82 'deft' => "" } ]; 82 83 … … 96 97 push(@{$hashArgOptLists->{"OptList"}},$options); 97 98 99 my $unknown_converter_self = new UnknownPlugin($pluginlist, $inputargs, $hashArgOptLists); 98 100 my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 99 my $unknown_converter_self = new UnknownPlugin($pluginlist, $inputargs, $hashArgOptLists); 100 my $self = BaseImporter::merge_inheritance($cbf_self, $unknown_converter_self); 101 102 # Need to feed the superclass plugins to merge_inheritance() below in the order that the 103 # superclass plugins were declared in the ISA listing earlier in this file: 104 my $self = BaseImporter::merge_inheritance($unknown_converter_self, $cbf_self); 101 105 102 106 $self = bless $self, $class; 103 107 104 $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set at start of this file??????? 108 my $outhandle = $self->{'outhandle'}; 109 print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n"; 110 if(!defined $self->{'convert_to'}) { 111 $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set at the start of this file??????? 112 } 105 113 106 114 # Convert_To set up, including secondary_plugins for processing the text or html generated … … 173 181 my $plugin_name = $self->{'plugin_type'}; # inherited from BaseImporter 174 182 183 #### COPIED FROM ConvertBinaryFile::tmp_area_convert_file() 184 my $outhandle = $self->{'outhandle'}; 185 my $convert_to = $self->{'convert_to'}; 186 my $failhandle = $self->{'failhandle'}; 187 my $convert_to_ext = $self->{'convert_to_ext'}; #set by ConvertBinaryFile::set_standard_convert_settings() 188 189 190 my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename); 191 192 # derive tmp filename from input filename 193 my ($tailname, $dirname, $suffix) 194 = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$"); 195 196 # softlink to collection tmp dir 197 my $tmp_dirname = &util::get_timestamped_tmp_folder(); 198 if (defined $tmp_dirname) { 199 $self->{'tmp_dir'} = $tmp_dirname; 200 } else { 201 $tmp_dirname = $dirname; 202 } 203 204 # # convert to utf-8 otherwise we have problems with the doc.xml file later on 205 # my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname); 206 207 # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file 208 my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname); 209 210 211 # URLEncode this since htmls with images where the html filename is utf8 don't seem 212 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded 213 # files on the filesystem. 214 $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix"); 215 216 my $lc_suffix = lc($suffix); 217 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix"); 218 219 # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix 220 # But we can't softlink to relative paths. Therefore, we need to ensure that 221 # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html 222 my $ensure_path_absolute = 1; # true 223 &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute); 224 my $verbosity = $self->{'verbosity'}; 225 if ($verbosity > 0) { 226 print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n"; 227 } 228 229 my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log"); 230 231 232 my $output_type=$self->{'convert_to'}; 233 234 # store the *actual* output type and return the output filename 235 # it's possible we requested conversion to html, but only to text succeeded 236 #$self->{'convert_to_ext'} = $output_type; 237 if ($output_type =~ /html/i) { 238 $self->{'converted_to'} = "HTML"; 239 } elsif ($output_type =~ /te?xt/i) { 240 $self->{'converted_to'} = "Text"; 241 } elsif ($output_type =~ /item/i || $output_type =~ /^pagedimg/){ 242 $self->{'converted_to'} = "PagedImage"; 243 } 244 245 my $output_filename = $tmp_filename; 246 my $output_dirname; 247 if ($output_type =~ /item/i || $output_type =~ /^pagedimg/) { 248 # running under windows 249 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 250 $output_dirname = $tmp_dirname . "\\$utf8_tailname\\" . $utf8_tailname; 251 } else { 252 $output_dirname = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname; 253 } 254 $output_filename .= ".item"; 255 } else { 256 $output_filename =~ s/$lc_suffix$/.$output_type/; 257 } 258 259 #### END COPIED FROM ConvertBinaryFile::tmp_area_convert_file() 260 261 # Execute the conversion command and get the type of the result, 262 # making sure the converter gives us the appropriate output type 263 175 264 # On Linux: if the program isn't installed, $? tends to come back with 127, in any case neither 0 nor 1. 176 265 # On Windows: echo %ERRORLEVEL% ends up as 9009 if the program is not installed. … … 178 267 # should produce either a text file or output to stdout. 179 268 180 my $outhandle=$self->{'outhandle'};181 182 269 my $cmd = $self->{'exec_cmd'}; 183 270 if(!$cmd) { # empty string for instance 184 print $outhandle "$plugin_name Conversion error: invalid cmd $cmd\n";271 print $outhandle "$plugin_name Conversion error: a command to execute is required, cmd provided is |$cmd|\n"; 185 272 return ""; 186 273 } 187 274 188 # replace occurrences of '*' placeholder in cmd string with input filename 189 my ($tailname, $dir, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 190 $cmd =~ s/\*/$tailname/g; 191 print STDERR "@@@@ $plugin_name: executing conversion cmd $cmd\n"; 275 # HARDCODING CMD FOR NOW 276 #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html \"/Scratch/ak19/tutorial_sample_files/pdfbox/A9-access-best-practices.pdf\" \"/Scratch/ak19/gs3-svn-15Nov2016/pdf-tmp/1.html\""; 277 278 #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html INPUT_FILE OUTPUT"; 279 280 # replace occurrences of placeholders in cmd string 281 #$cmd =~ s@\"@\\"@g; 282 $cmd =~ s@INPUT_FILE@\"$input_filename\"@g; 283 if(defined $output_dirname) { 284 $cmd =~ s@OUTPUT@\"$output_dirname\"@g; 285 } else { 286 $cmd =~ s@OUTPUT@\"$output_filename\"@g; 287 } 288 289 print STDERR "@@@@ $plugin_name: executing conversion cmd \n|$cmd|\n"; 290 print STDERR " on infile |$input_filename|\n"; 291 print STDERR " to produce expected $output_filename\n"; 192 292 my $status = system($cmd); 193 293 … … 202 302 } 203 303 204 my $output_file_or_dir = $self->{'output_file_or_dir_name'}; 205 if (!-e $output_file_or_dir) { 206 print $outhandle "$plugin_name Conversion error: Output file/dir $output_file_or_dir doesn't exist\n"; 304 # remove symbolic link to original file 305 &FileUtils::removeFiles($tmp_filename); 306 307 308 if(defined $output_dirname && -d $output_dirname) { 309 print $outhandle "$plugin_name Conversion error: Output directory $output_dirname doesn't exist\n"; 207 310 return ""; 208 311 } 312 elsif (!-e $output_filename) { 313 print $outhandle "$plugin_name Conversion error: Output file $output_filename doesn't exist\n"; 314 return ""; 315 } 209 316 210 317 # else, conversion success 211 318 212 319 # if multiple images were generated by running the conversion 213 if ($self->{'convert_to'} eq "pagedimg") { 214 my $item_filename = $self->generate_item_file($output_file_or_dir); 215 return $item_filename; 216 } 217 218 return $output_file_or_dir; 320 if ($self->{'convert_to'} =~ /^pagedimg/) { 321 my $item_filename = $self->generate_item_file($output_filename); #my $item_filename = $self->generate_item_file($output_file_or_dir); 322 323 if (!-e $item_filename) { 324 print $outhandle "$plugin_name Conversion error: Item file $item_filename was not generated\n"; 325 return ""; 326 } 327 $output_filename = $item_filename; 328 } 329 330 $self->{'output_dirname'} = $output_dirname; 331 $self->{'output_filename'} = $output_filename; 332 333 return $output_filename; #$output_file_or_dir; 219 334 } 220 335 … … 230 345 return undef unless $self->can_process_this_file($filename_full_path); 231 346 232 my $output_file_or_dir = $self->{'output_file_or_dir_name'}; 233 my $is_output_dir = (-d $output_file_or_dir) ? 1 : 0; 347 my $is_output_dir = (defined $self->{'output_dirname'}) ? 1 : 0; 234 348 235 349 # we are only doing something special if we have a directory of html files 236 if (!$is_output_dir || $self->{'convert_to'} ne "html") { 350 #if ($is_output_dir || $self->{'convert_to'} ne "html") { 351 if ($self->{'convert_to'} ne "html_multi") { 237 352 return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm 238 353 } … … 320 435 # deleted some commented out code here that exists in PowerPointPlugin 321 436 322 # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created 323 # as we don't know where it was created 324 #$self->clean_up_after_doc_obj_processing();437 # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created? 438 # as we don't know where it was created. No. Now creating in tmp. 439 $self->clean_up_after_doc_obj_processing(); 325 440 326 441 … … 333 448 sub read_into_doc_obj { 334 449 my $self = shift (@_); 335 $self->ConvertBinaryFile:: deinit(@_);450 $self->ConvertBinaryFile::read_into_doc_obj(@_); 336 451 } 337 452 -
main/trunk/greenstone2/perllib/strings.properties
r31754 r31757 1271 1271 TextPlugin.title_sub:Substitution expression to modify string stored as Title. Used by, for example, PostScriptPlugin to remove "Page 1" etc from text used as the title. 1272 1272 1273 UnknownConverterPlugin.desc:If you have a custom conversion tool installed that you're able to run from the command line to convert from an unsupported document format to either text or HTML, provide that command to this Plugin and it will run the command for you, capturing the output for indexing by Greenstone, making your document searchable. Use * as placeholder for input file name, but specify suffix of file to be converted (and also of any output file generated, if a file and not dir of files is generated).1274 1275 UnknownConverterPlugin.exec_cmd:Command line command string to execute that will do the conversion. 1273 UnknownConverterPlugin.desc:If you have a custom conversion tool installed that you're able to run from the command line to convert from an unsupported document format to text, HTML or a series of images in jpg, png or gif form, then provide that command to this Plugin. It will then run the command for you, capturing the output for indexing by Greenstone, making any documents that aren't converted to images searchable. Set the process_extension to the suffix of files to be converted. Set convert_to to be the output format that the conversion command will generate, which will determine the output file's suffix. Use INPUT_FILE and OUTPUT as place holders in the command, which Greenstone will replace. It will pass in the full path to each file that matches the process_extension suffix in turn as INPUT_FILE. OUTPUT will be replaced with a path in the temporary folder of the output file with suffix determined by the value of convert_to. If convert_to is a pagedimg type, Greenstone sets OUTPUT to be a directory to contain the expected files and will create an item file collating the parts of the document. 1274 1275 UnknownConverterPlugin.exec_cmd:Command line command string to execute that will do the conversion. Quoted elements need to have the quotes escaped with a backslash to preserve them. 1276 1276 1277 1277 UnknownConverterPlugin.output_file_or_dir_name: Full pathname of the output file or of the directory (of output files) that get generated by the conversion
Note:
See TracChangeset
for help on using the changeset viewer.