Changeset 31766
- Timestamp:
- 2017-06-29T19:50:02+12:00 (6 years ago)
- Location:
- main/trunk/greenstone2/perllib/plugins
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r31761 r31766 263 263 my $ensure_path_absolute = 1; # true 264 264 &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute); 265 266 my $output_filename = $self->run_conversion_command($tmp_dirname, $tmp_filename, 267 $utf8_tailname, $lc_suffix, $tailname, $suffix); 268 269 return $output_filename; 270 } 271 272 # The latter half of tmp_area_convert_file: runs the conversion command and returns the output file name 273 # Split from tmp_area_convert_file because UnknownConverterPlugin can then inherit all of 274 # tmp_area_convert_file and only needs to override this part: 275 sub run_conversion_command { 276 my $self = shift (@_); 277 my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_; 278 279 my $outhandle = $self->{'outhandle'}; 280 my $convert_to = $self->{'convert_to'}; 281 my $failhandle = $self->{'failhandle'}; 282 265 283 my $verbosity = $self->{'verbosity'}; 266 284 if ($verbosity > 0) { -
main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm
r31764 r31766 45 45 # At present, a file or folder of files is assumed. 46 46 # Need to look in there for files with extension process_ext. 47 # Do we also need a html_multi option to convert_to? Support html_multi as output? 47 # Do we also need a html_multi option to convert_to? If supporting html_multi as output, 48 # see PowerPointPlugin::read(), and revision 31764 of UnknownConverterPlugin.pm 48 49 # Then a folder of html files is generated per document? 49 50 # OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version. … … 104 105 105 106 my $outhandle = $self->{'outhandle'}; 106 print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n";107 107 if(!defined $self->{'convert_to'}) { 108 $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set at the start of this file??????? 109 } 108 $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set in $convert_to_list declaration???? 109 } 110 #print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n"; 110 111 111 112 # Convert_To set up, including secondary_plugins for processing the text or html generated … … 151 152 152 153 # Are init, begin and deinit necessary (will they not get called automatically)? 154 # Dr Bainbridge says it doesn't hurt for these to be explicitly defined here. 153 155 # Copied here from PDFPlugin, PowerPointPlugin 154 156 # https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when … … 176 178 } 177 179 178 # overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm179 sub tmp_area_convert_file { 180 # should we first hardlink the output files/folder to tmp area, so we won't be working across drives? 181 182 my $self = shift (@_); 183 my ($output_ext, $input_filename, $textref) = @_;184 185 #### COPIED FROM ConvertBinaryFile::tmp_area_convert_file()180 # Called by ConvertBinaryFile::tmp_area_convert_file() to do the actual conversion 181 # In order to call the custom conversion command, UnknownConverterPlugin needs to know the actual 182 # input filename (which is the tmp_filename parameter) and the output file name, which this subroutine 183 # will work out. Then it will run the conversion command. 184 sub run_conversion_command { 185 my $self = shift (@_); 186 my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_; 187 186 188 my $outhandle = $self->{'outhandle'}; 187 189 my $convert_to = $self->{'convert_to'}; 188 190 my $failhandle = $self->{'failhandle'}; 189 my $convert_to_ext = $self->{'convert_to_ext'}; #set by ConvertBinaryFile::set_standard_convert_settings()190 191 192 my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);193 194 # derive tmp filename from input filename195 my ($tailname, $dirname, $suffix)196 = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");197 198 # softlink to collection tmp dir199 my $tmp_dirname = &util::get_timestamped_tmp_folder();200 if (defined $tmp_dirname) {201 $self->{'tmp_dir'} = $tmp_dirname;202 } else {203 $tmp_dirname = $dirname;204 }205 206 # # convert to utf-8 otherwise we have problems with the doc.xml file later on207 # my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname);208 209 # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file210 my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);211 212 213 # URLEncode this since htmls with images where the html filename is utf8 don't seem214 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded215 # files on the filesystem.216 $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");217 218 my $lc_suffix = lc($suffix);219 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix");220 221 # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix222 # But we can't softlink to relative paths. Therefore, we need to ensure that223 # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html224 my $ensure_path_absolute = 1; # true225 &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);226 191 my $verbosity = $self->{'verbosity'}; 192 193 my $convert_to_ext = $self->{'convert_to_ext'}; 227 194 if ($verbosity > 0) { 228 195 print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n"; 229 196 } 230 197 231 my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log"); 232 233 198 # The command to be executed must be provided the input filename and output file/dir name 199 # input filename = tmp_filename 200 # 1. We now work out the output filename. Code for it comes from 201 # ConvertBinaryFile::tmp_area_convert_file(), but slightly modified 202 234 203 my $output_type=$self->{'convert_to'}; 235 204 … … 259 228 } 260 229 261 #### END COPIED FROM ConvertBinaryFile::tmp_area_convert_file() 262 263 # Execute the conversion command and get the type of the result, 230 231 # 2. Execute the conversion command and get the type of the result, 264 232 # making sure the converter gives us the appropriate output type 265 233 … … 276 244 return ""; 277 245 } 278 279 # HARDCODING CMD FOR NOW280 #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html \"/Scratch/ak19/tutorial_sample_files/pdfbox/A9-access-best-practices.pdf\" \"/Scratch/ak19/gs3-svn-15Nov2016/pdf-tmp/1.html\"";281 282 #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html %INPUT_FILE %OUTPUT";283 246 284 247 # replace occurrences of placeholders in cmd string … … 294 257 if ($self->{'verbosity'} > 2) { 295 258 print STDERR "$plugin_name: executing conversion cmd \n|$cmd|\n"; 296 print STDERR " on infile |$ input_filename|\n";259 print STDERR " on infile |$tmp_filename|\n"; 297 260 print STDERR " to produce expected $output_filename\n"; 298 261 } … … 341 304 342 305 return $output_filename; 343 } 344 345 # Copied from PowerPointPlugin, with some modifications 346 # override default read in some situations, as the conversion of ppt to html results in many files, and we want them all to be processed. 347 sub read { 348 my $self = shift (@_); 349 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 350 351 # can we process this file?? 352 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 353 354 return undef unless $self->can_process_this_file($filename_full_path); 355 356 my $is_output_dir = (defined $self->{'output_dirname'}) ? 1 : 0; 357 358 # we are only doing something special if we have a directory of html files 359 #if ($is_output_dir || $self->{'convert_to'} ne "html") { 360 if ($self->{'convert_to'} ne "html_multi") { 361 return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm 362 } 363 my $outhandle = $self->{'outhandle'}; 364 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 365 print $outhandle "$self->{'plugin_type'} processing $file\n" 366 if $self->{'verbosity'} > 1; 367 368 my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version 369 if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline 370 if (! -e "$conv_filename") {return -1;} 371 372 my ($tailname, $html_dirname, $suffix) 373 = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$"); 374 375 my $collect_file = &util::filename_within_collection($filename_full_path); 376 my $dirname_within_collection = &util::filename_within_collection($html_dirname); 377 my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"}; 378 379 my @dir; 380 if (!opendir (DIR, $html_dirname)) { 381 print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n"; 382 # just process the original file 383 @dir = ("$tailname.$suffix"); 384 385 } else { 386 @dir = readdir (DIR); 387 closedir (DIR); 388 } 389 390 foreach my $file (@dir) { 391 next unless $file =~ /\.html$/; 392 393 my ($rv, $doc_obj) = 394 $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli); 395 if ((!defined $rv) || ($rv<1)) { 396 # wasn't processed 397 return $rv; 398 } 399 400 # next block copied from ConvertBinaryFile 401 # from here ... 402 # Override previous gsdlsourcefilename set by secondary plugin 403 404 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'}); 405 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 406 # build. so set it manually. 407 $doc_obj->set_source_path($filename_full_path); 408 $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $file)); 409 410 my $plugin_filename_encoding = $self->{'filename_encoding'}; 411 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 412 $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding); 413 414 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 415 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path)); 416 417 418 my ($tailname, $dirname, $suffix) 419 = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$"); 420 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname); 421 422 423 my $topsection = $doc_obj->get_top_section(); 424 $self->add_associated_files($doc_obj, $filename_full_path); 425 426 # extra_metadata is already called by sec plugin in process?? 427 $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here?? 428 # do any automatic metadata extraction 429 $self->auto_extract_metadata ($doc_obj); 430 431 # have we found a Title?? 432 $self->title_fallback($doc_obj,$topsection,$filename_no_path); 433 434 # use the one generated by HTMLPlugin, otherwise they all end up with same id. 435 #$self->add_OID($doc_obj); 436 # to here... 437 438 # process it 439 $processor->process($doc_obj); 440 undef $doc_obj; 441 } 442 $self->{'num_processed'} ++; 443 444 # deleted some commented out code here that exists in PowerPointPlugin 445 446 # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created? 447 # as we don't know where it was created. No. Now creating in tmp. 448 $self->clean_up_after_doc_obj_processing(); 449 450 451 # if process_status == 1, then the file has been processed. 452 return 1; 453 454 } 306 307 } 308 455 309 456 310 # use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
Note:
See TracChangeset
for help on using the changeset viewer.