Changeset 31766 for main/trunk


Ignore:
Timestamp:
2017-06-29T19:50:02+12:00 (7 years ago)
Author:
ak19
Message:
  1. Refactored ConvertBinaryFile::tmp_area_convert_file() so that the conversion command (the final portion of the function) is run in a separate subroutine, the new run_conversion_command(), which the new subclass UnknownConverterPlugin can override. 2. UnknownConverterPlugin now does the unique portions of its previously overridden tmp_area_convert_file() in its override of run_conversion_command(), since it inherits all of tmp_area_convert_file(). 3. Removed the currently unused overridden read() method. Also removed some other unwanted things.
Location:
main/trunk/greenstone2/perllib/plugins
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r31761 r31766  
    263263    my $ensure_path_absolute = 1; # true
    264264    &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
     265
     266    my $output_filename = $self->run_conversion_command($tmp_dirname, $tmp_filename,
     267                            $utf8_tailname, $lc_suffix, $tailname, $suffix);
     268
     269    return $output_filename;
     270}
     271
     272# The latter half of tmp_area_convert_file: runs the conversion command and returns the output file name
     273# Split from tmp_area_convert_file because UnknownConverterPlugin can then inherit all of
     274# tmp_area_convert_file and only needs to override this part:
     275sub run_conversion_command {
     276    my $self = shift (@_);
     277    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;   
     278
     279    my $outhandle = $self->{'outhandle'};
     280    my $convert_to = $self->{'convert_to'};
     281    my $failhandle = $self->{'failhandle'};
     282
    265283    my $verbosity = $self->{'verbosity'};
    266284    if ($verbosity > 0) {
  • main/trunk/greenstone2/perllib/plugins/UnknownConverterPlugin.pm

    r31764 r31766  
    4545# At present, a file or folder of files is assumed.
    4646# Need to look in there for files with extension process_ext.
    47 # Do we also need a html_multi option to convert_to? Support html_multi as output?
     47# Do we also need a html_multi option to convert_to? If supporting html_multi as output,
     48# see PowerPointPlugin::read(), and revision 31764 of UnknownConverterPlugin.pm
    4849# Then a folder of html files is generated per document?
    4950# OR Flag that indicates whether an html file + associated folder (such as of images) gets generated. And name of assoc folder. Such output gets generated for instance when a doc file is replaced by its html version.
     
    104105
    105106my $outhandle = $self->{'outhandle'};
    106     print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n";
    107107    if(!defined $self->{'convert_to'}) {
    108     $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set at the start of this file???????
    109     }
     108    $self->{'convert_to'} = "text"; # why do I have to set a value for convert_to here, when a default's already set in $convert_to_list declaration????
     109    }
     110    #print STDERR "\n\n**** convert_to is |" . $self->{'convert_to'} . "|\n\n";
    110111
    111112    # Convert_To set up, including secondary_plugins for processing the text or html generated
     
    151152
    152153# Are init, begin and deinit necessary (will they not get called automatically)?
     154# Dr Bainbridge says it doesn't hurt for these to be explicitly defined here.
    153155# Copied here from PDFPlugin, PowerPointPlugin
    154156# https://stackoverflow.com/questions/42885207/why-doesnt-class-supernew-call-the-constructors-of-all-parent-classes-when
     
    176178}
    177179
    178 # overridden to run the custom conversion command here in place of gsConvert.pl called by ConvertBinaryFile.pm
    179 sub tmp_area_convert_file {
    180     # should we first hardlink the output files/folder to tmp area, so we won't be working across drives?
    181 
    182     my $self = shift (@_);
    183     my ($output_ext, $input_filename, $textref) = @_;
    184 
    185     #### COPIED FROM ConvertBinaryFile::tmp_area_convert_file()
     180# Called by ConvertBinaryFile::tmp_area_convert_file() to do the actual conversion
     181# In order to call the custom conversion command, UnknownConverterPlugin needs to know the actual
     182# input filename (which is the tmp_filename parameter) and the output file name, which this subroutine
     183# will work out. Then it will run the conversion command.
     184sub run_conversion_command {
     185    my $self = shift (@_);
     186    my ($tmp_dirname, $tmp_filename, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;   
     187   
    186188    my $outhandle = $self->{'outhandle'};
    187189    my $convert_to = $self->{'convert_to'};
    188190    my $failhandle = $self->{'failhandle'};
    189     my $convert_to_ext = $self->{'convert_to_ext'}; #set by ConvertBinaryFile::set_standard_convert_settings()
    190    
    191 
    192     my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);
    193 
    194     # derive tmp filename from input filename
    195     my ($tailname, $dirname, $suffix)
    196     = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");
    197 
    198     # softlink to collection tmp dir
    199     my $tmp_dirname = &util::get_timestamped_tmp_folder();
    200     if (defined $tmp_dirname) {
    201     $self->{'tmp_dir'} = $tmp_dirname;
    202     } else {
    203     $tmp_dirname = $dirname;
    204     }
    205    
    206 #    # convert to utf-8 otherwise we have problems with the doc.xml file later on
    207 #    my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname);
    208 
    209     # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
    210      my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);
    211 
    212 
    213     # URLEncode this since htmls with images where the html filename is utf8 don't seem
    214     # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    215     # files on the filesystem.
    216     $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");
    217 
    218     my $lc_suffix = lc($suffix);
    219     my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$utf8_tailname$lc_suffix");
    220    
    221     # If gsdl is remote, we're given relative path to input file, of the form import/utf8_tailname.suffix
    222     # But we can't softlink to relative paths. Therefore, we need to ensure that
    223     # the input_filename is the absolute path, see http://perldoc.perl.org/File/Spec.html
    224     my $ensure_path_absolute = 1; # true
    225     &FileUtils::softLink($input_filename, $tmp_filename, $ensure_path_absolute);
    226191    my $verbosity = $self->{'verbosity'};
     192   
     193    my $convert_to_ext = $self->{'convert_to_ext'};
    227194    if ($verbosity > 0) {
    228195    print $outhandle "Converting $tailname$suffix to $convert_to format with extension $convert_to_ext\n";
    229196    }
    230197
    231     my $errlog = &FileUtils::filenameConcatenate($tmp_dirname, "err.log");
    232    
    233  
     198    # The command to be executed must be provided the input filename and output file/dir name
     199    # input filename = tmp_filename
     200    # 1. We now work out the output filename. Code for it comes from
     201    # ConvertBinaryFile::tmp_area_convert_file(), but slightly modified
     202
    234203    my $output_type=$self->{'convert_to'};
    235204
     
    259228    }
    260229
    261     #### END COPIED FROM ConvertBinaryFile::tmp_area_convert_file()
    262 
    263     # Execute the conversion command and get the type of the result,
     230
     231    # 2. Execute the conversion command and get the type of the result,
    264232    # making sure the converter gives us the appropriate output type
    265233
     
    276244    return "";
    277245    }
    278 
    279     # HARDCODING CMD FOR NOW
    280     #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html \"/Scratch/ak19/tutorial_sample_files/pdfbox/A9-access-best-practices.pdf\" \"/Scratch/ak19/gs3-svn-15Nov2016/pdf-tmp/1.html\"";
    281 
    282     #$cmd ="/Scratch/ak19/gs3-svn-15Nov2016/packages/jre/bin/java -cp \"/Scratch/ak19/gs3-svn-15Nov2016/gs2build/ext/pdf-box/lib/java/pdfbox-app.jar\" -Dline.separator=\"<br />\" org.apache.pdfbox.ExtractText -html %INPUT_FILE %OUTPUT";
    283246
    284247    # replace occurrences of placeholders in cmd string
     
    294257    if ($self->{'verbosity'} > 2) {
    295258    print STDERR "$plugin_name: executing conversion cmd \n|$cmd|\n";
    296     print STDERR "   on infile |$input_filename|\n";
     259    print STDERR "   on infile |$tmp_filename|\n";
    297260    print STDERR "   to produce expected $output_filename\n";
    298261    }
     
    341304   
    342305    return $output_filename;
    343 }
    344 
    345 # Copied from PowerPointPlugin, with some modifications
    346 # override default read in some situations, as the conversion of ppt to html results in many files, and we want them all to be processed.
    347 sub read {
    348     my $self = shift (@_); 
    349     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    350    
    351     # can we process this file??
    352     my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    353    
    354     return undef unless $self->can_process_this_file($filename_full_path);
    355    
    356     my $is_output_dir = (defined $self->{'output_dirname'}) ? 1 : 0;
    357 
    358     # we are only doing something special if we have a directory of html files
    359     #if ($is_output_dir || $self->{'convert_to'} ne "html") {
    360     if ($self->{'convert_to'} ne "html_multi") {
    361     return $self->BaseImporter::read(@_); # no read in ConvertBinaryFile.pm
    362     }
    363     my $outhandle = $self->{'outhandle'};
    364     print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    365     print $outhandle "$self->{'plugin_type'} processing $file\n"
    366         if $self->{'verbosity'} > 1;
    367 
    368     my $conv_filename = $self->tmp_area_convert_file("html", $filename_full_path); # uses our overridden version
    369     if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    370     if (! -e "$conv_filename") {return -1;}
    371 
    372     my ($tailname, $html_dirname, $suffix)
    373     = &File::Basename::fileparse($conv_filename, "\\.[^\\.]+\$");
    374 
    375     my $collect_file = &util::filename_within_collection($filename_full_path);
    376     my $dirname_within_collection = &util::filename_within_collection($html_dirname);
    377     my $secondary_plugin = $self->{'secondary_plugins'}->{"HTMLPlugin"};
    378 
    379     my @dir;
    380     if (!opendir (DIR, $html_dirname)) {
    381     print $outhandle "PowerPointPlugin: Couldn't read directory $html_dirname\n";
    382     # just process the original file
    383     @dir = ("$tailname.$suffix");
    384    
    385     } else {
    386     @dir = readdir (DIR);
    387     closedir (DIR);
    388     }
    389 
    390     foreach my $file (@dir) {
    391     next unless $file =~ /\.html$/;
    392    
    393     my ($rv, $doc_obj) =
    394         $secondary_plugin->read_into_doc_obj ($pluginfo,"", &util::filename_cat($html_dirname,$file), $block_hash, {}, $processor, $maxdocs, $total_count, $gli);
    395     if ((!defined $rv) || ($rv<1)) {
    396         # wasn't processed
    397         return $rv;
    398     }
    399 
    400     # next block copied from ConvertBinaryFile
    401     # from here ...
    402     # Override previous gsdlsourcefilename set by secondary plugin
    403    
    404     $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    405     ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    406     # build. so set it manually.
    407     $doc_obj->set_source_path($filename_full_path);
    408     $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $file));
    409    
    410     my $plugin_filename_encoding = $self->{'filename_encoding'};
    411     my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    412     $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);
    413        
    414     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    415     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
    416 
    417    
    418     my ($tailname, $dirname, $suffix)
    419         = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
    420     $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FilenameRoot", $tailname);
    421    
    422 
    423     my $topsection = $doc_obj->get_top_section();
    424     $self->add_associated_files($doc_obj, $filename_full_path);
    425    
    426     # extra_metadata is already called by sec plugin in process??
    427     $self->extra_metadata($doc_obj, $topsection, $metadata); # do we need this here??
    428     # do any automatic metadata extraction
    429     $self->auto_extract_metadata ($doc_obj);
    430    
    431     # have we found a Title??
    432     $self->title_fallback($doc_obj,$topsection,$filename_no_path);
    433    
    434     # use the one generated by HTMLPlugin, otherwise they all end up with same id.
    435     #$self->add_OID($doc_obj);
    436     # to here...
    437 
    438     # process it
    439     $processor->process($doc_obj);
    440     undef $doc_obj;
    441     }
    442     $self->{'num_processed'} ++;
    443 
    444     # deleted some commented out code here that exists in PowerPointPlugin
    445 
    446     # for UnknownConverterPlugin, don't delete any temp files that the conversion may have created?
    447     # as we don't know where it was created. No. Now creating in tmp.
    448     $self->clean_up_after_doc_obj_processing();
    449 
    450 
    451     # if process_status == 1, then the file has been processed.
    452     return 1;
    453 
    454 }
     306
     307}
     308
    455309
    456310# use the read_into_doc_obj inherited from ConvertBinaryFile "to call secondary plugin stuff"
Note: See TracChangeset for help on using the changeset viewer.