Changeset 31766

Show
Ignore:
Timestamp:
29.06.2017 19:50:02 (3 months ago)
Author:
ak19
Message:

1. Refactored ConvertBinaryFile::tmp_area_convert_file() to run the conversion command (the final portion of the function) in a separate subroutine, the new run_conversion_command(), so that the new subclass UnknownConverterPlugin can override this method. 2. UnknownConverterPlugin currently does the unique portions of its previously overridden tmp_area_convert_file() in the new run_conversion_command() that it now overrides, since it inherits all of tmp_area_convert_file(). 3. Removed the currently unused overridden read() method. Also removed some other unwanted things.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r31761 r31766  
    263263    my $ensure_path_absolute = 1; # true 
    264264    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute); 
     265 
     266    my $output_filename = $self->run_conversion_command($tmp_dirname, $tmp_filename, 
     267                            $utf8_tailname, $lc_suffix, $tailname, $suffix); 
     268 
     269    return $output_filename; 
     270} 
     271 
     272# The latter half of tmp_area_convert_file: runs the conversion command and returns the output file name 
     273# Split from tmp_area_convert_file because UnknownConverterPlugin can then inherit all of  
     274# tmp_area_convert_file and only needs to override this part: 
     275sub run_conversion_command { 
     276    my $self = shift (@_); 
     277    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;     
     278 
     279    my $outhandle = $self->{'outhandle'}; 
     280    my $convert_to = $self->{'convert_to'}; 
     281    my $failhandle = $self->{'failhandle'}; 
     282 
    265283    my $verbosity = $self->{'verbosity'}; 
    266284    if ($verbosity > 0) { 
  • main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm

    r31764 r31766  
    4545# At present, a file or folder of files is assumed. 
    4646# Need to look in there for files with extension process_ext. 
    47 # Do we also need a html_multi option to convert_to? Support html_multi as output? 
     47# Do we also need a html_multi option to convert_to? If supporting html_multi as output,  
     48# see PowerPointPlugin::read(), and revision 31764 of UnknownConverterPlugin.pm 
    4849# Then a folder of html files is generated per document?  
    4950# OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version. 
     
    104105 
    105106my $outhandle = $self->{'outhandle'}; 
    106     print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n"; 
    107107    if(!defined $self->{'convert_to'}) { 
    108     $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set at the start of this file??????? 
    109     } 
     108    $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set in $convert_to_list declaration???? 
     109    } 
     110    #print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n"; 
    110111 
    111112    # Convert_To set up, including secondary_plugins for processing the text or html generated 
     
    151152 
    152153# Are init, begin and deinit necessary (will they not get called automatically)? 
     154# Dr Bainbridge says it doesn't hurt for these to be explicitly defined here. 
    153155# Copied here from PDFPlugin, PowerPointPlugin 
    154156# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when 
     
    176178} 
    177179 
    178 # overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm 
    179 sub tmp_area_convert_file { 
    180     # should we first hardlink the output files/folder to tmp area, so we won't be working across drives? 
    181  
    182     my $self = shift (@_); 
    183     my ($output_ext, $input_filename, $textref) = @_; 
    184  
    185     #### COPIED FROM ConvertBinaryFile::tmp_area_convert_file() 
     180# Called by ConvertBinaryFile::tmp_area_convert_file() to do the actual conversion 
     181# In order to call the custom conversion command, UnknownConverterPlugin needs to know the actual  
     182# input filename (which is the tmp_filename parameter) and the output file name, which this subroutine 
     183# will work out. Then it will run the conversion command. 
     184sub run_conversion_command { 
     185    my $self = shift (@_); 
     186    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;     
     187     
    186188    my $outhandle = $self->{'outhandle'}; 
    187189    my $convert_to = $self->{'convert_to'}; 
    188190    my $failhandle = $self->{'failhandle'}; 
    189     my $convert_to_ext = $self->{'convert_to_ext'}; #set by ConvertBinaryFile::set_standard_convert_settings() 
    190      
    191  
    192     my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename); 
    193  
    194     # derive tmp filename from input filename 
    195     my ($tailname, $dirname, $suffix) 
    196     = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$"); 
    197  
    198     # softlink to collection tmp dir 
    199     my $tmp_dirname = &util::get_timestamped_tmp_folder(); 
    200     if (defined $tmp_dirname) { 
    201     $self->{'tmp_dir'} = $tmp_dirname; 
    202     } else { 
    203     $tmp_dirname = $dirname; 
    204     } 
    205      
    206 #    # convert to utf-8 otherwise we have problems with the doc.xml file later on 
    207 #    my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname); 
    208  
    209     # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file 
    210      my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname); 
    211  
    212  
    213     # URLEncode this since htmls with images where the html filename is utf8 don't seem 
    214     # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded 
    215     # files on the filesystem. 
    216     $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix"); 
    217  
    218     my $lc_suffix = lc($suffix); 
    219     my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix"); 
    220      
    221     # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix 
    222     # But we can't softlink to relative paths. Therefore, we need to ensure that 
    223     # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html 
    224     my $ensure_path_absolute = 1; # true 
    225     &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute); 
    226191    my $verbosity = $self->{'verbosity'}; 
     192     
     193    my $convert_to_ext = $self->{'convert_to_ext'}; 
    227194    if ($verbosity > 0) { 
    228195    print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n"; 
    229196    } 
    230197 
    231     my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log"); 
    232      
    233   
     198    # The command to be executed must be provided the input filename and output file/dir name 
     199    # input filename = tmp_filename 
     200    # 1. We now work out the output filename. Code for it comes from 
     201    # ConvertBinaryFile::tmp_area_convert_file(), but slightly modified 
     202 
    234203    my $output_type=$self->{'convert_to'}; 
    235204 
     
    259228    } 
    260229 
    261     #### END COPIED FROM ConvertBinaryFile::tmp_area_convert_file() 
    262  
    263     # Execute the conversion command and get the type of the result, 
     230 
     231    # 2. Execute the conversion command and get the type of the result, 
    264232    # making sure the converter gives us the appropriate output type 
    265233 
     
    276244    return ""; 
    277245    } 
    278  
    279     # HARDCODING CMD FOR NOW 
    280     #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html \"/Scratch/ak19/tutorial_sample_files/pdfbox/A9-access-best-practices.pdf\" \"/Scratch/ak19/gs3-svn-15Nov2016/pdf-tmp/1.html\""; 
    281  
    282     #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html %INPUT_FILE %OUTPUT"; 
    283246 
    284247    # replace occurrences of placeholders in cmd string 
     
    294257    if ($self->{'verbosity'} > 2) { 
    295258    print STDERR "$plugin_name: executing conversion cmd \n|$cmd|\n"; 
    296     print STDERR "   on infile |$input_filename|\n"; 
     259    print STDERR "   on infile |$tmp_filename|\n"; 
    297260    print STDERR "   to produce expected $output_filename\n"; 
    298261    } 
     
    341304     
    342305    return $output_filename; 
    343 } 
    344  
    345 # Copied from PowerPointPlugin, with some modifications 
    346 # override default read in some situations, as the conversion of ppt to html results in many files, and we want them all to be processed. 
    347 sub read { 
    348     my $self = shift (@_);   
    349     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    350      
    351     # can we process this file?? 
    352     my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    353      
    354     return undef unless $self->can_process_this_file($filename_full_path); 
    355      
    356     my $is_output_dir = (defined $self->{'output_dirname'}) ? 1 : 0; 
    357  
    358     # we are only doing something special if we have a directory of html files 
    359     #if ($is_output_dir || $self->{'convert_to'} ne "html") { 
    360     if ($self->{'convert_to'} ne "html_multi") { 
    361     return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm 
    362     } 
    363     my $outhandle = $self->{'outhandle'}; 
    364     print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 
    365     print $outhandle "$self->{'plugin_type'} processing $file\n" 
    366         if $self->{'verbosity'} > 1; 
    367  
    368     my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version 
    369     if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline  
    370     if (! -e "$conv_filename") {return -1;}  
    371  
    372     my ($tailname, $html_dirname, $suffix) 
    373     = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$"); 
    374  
    375     my $collect_file = &util::filename_within_collection($filename_full_path); 
    376     my $dirname_within_collection = &util::filename_within_collection($html_dirname); 
    377     my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"}; 
    378  
    379     my @dir; 
    380     if (!opendir (DIR, $html_dirname)) { 
    381     print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n"; 
    382     # just process the original file 
    383     @dir = ("$tailname.$suffix"); 
    384      
    385     } else { 
    386     @dir = readdir (DIR); 
    387     closedir (DIR); 
    388     } 
    389  
    390     foreach my $file (@dir) { 
    391     next unless $file =~ /\.html$/; 
    392      
    393     my ($rv, $doc_obj) =  
    394         $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli); 
    395     if ((!defined $rv) || ($rv<1)) { 
    396         # wasn't processed 
    397         return $rv; 
    398     } 
    399  
    400     # next block copied from ConvertBinaryFile 
    401     # from here ... 
    402     # Override previous gsdlsourcefilename set by secondary plugin 
    403      
    404     $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});  
    405     ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 
    406     # build. so set it manually. 
    407     $doc_obj->set_source_path($filename_full_path); 
    408     $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $file)); 
    409      
    410     my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    411     my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    412     $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding); 
    413          
    414     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
    415     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path)); 
    416  
    417      
    418     my ($tailname, $dirname, $suffix) 
    419         = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$"); 
    420     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname); 
    421      
    422  
    423     my $topsection = $doc_obj->get_top_section(); 
    424     $self->add_associated_files($doc_obj, $filename_full_path); 
    425      
    426     # extra_metadata is already called by sec plugin in process?? 
    427     $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here?? 
    428     # do any automatic metadata extraction 
    429     $self->auto_extract_metadata ($doc_obj); 
    430      
    431     # have we found a Title?? 
    432     $self->title_fallback($doc_obj,$topsection,$filename_no_path); 
    433      
    434     # use the one generated by HTMLPlugin, otherwise they all end up with same id. 
    435     #$self->add_OID($doc_obj); 
    436     # to here... 
    437  
    438     # process it 
    439     $processor->process($doc_obj); 
    440     undef $doc_obj; 
    441     } 
    442     $self->{'num_processed'} ++; 
    443  
    444     # deleted some commented out code here that exists in PowerPointPlugin 
    445  
    446     # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created? 
    447     # as we don't know where it was created. No. Now creating in tmp. 
    448     $self->clean_up_after_doc_obj_processing(); 
    449  
    450  
    451     # if process_status == 1, then the file has been processed. 
    452     return 1; 
    453  
    454 } 
     306 
     307} 
     308 
    455309 
    456310# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"