Ignore:
Timestamp:
2018-07-16T21:28:00+12:00 (6 years ago)
Author:
ak19
Message:

First attempt at PDFv2Plugin.pm.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r32273 r32277  
    7878    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    7979    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    80     print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
     80    print STDERR "\t-output\tauto|html|pretty_html|paged_pretty_html|paged_html|text|paged_text|pagedimg_jpg|pagedimg_gif|pagedimg_png|pagedimgtxt_jpg|pagedimgtxt_png\t(output file type)\n";
    8181    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    8282    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
     
    318318    my $success = 0;
    319319    $output_type =~ s/.*\-(.*)/$1/i;
    320 
     320   
     321    print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n";
     322   
    321323    # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools
    322324    # and then decide which conversion command to run based on the output type
     
    353355  }
    354356   
    355   elsif ($pdf_tool eq "xpdftools" ) {
    356     # default to html output
     357    elsif ($pdf_tool eq "xpdftools" ) {
     358   
     359    # default to pretty html output
    357360    if (!$output_type) {
    358         $output_type = "html";
     361        $output_type = "pretty_html";
    359362    }
    360363   
     
    367370    #}
    368371   
    369     # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools.
    370     if ($output_type =~ m/^(paged_html|html)$/i) {
     372    # Attempt conversion to (paged) pretty HTML using the newer pdftohtml of Xpdftools.
     373    if ($output_type =~ m/pretty_html$/i) {
    371374        $success = &xpdf_to_html($dirname, $input_filename, $output_filestem);
    372375        if ($success) {
     
    376379   
    377380    # Attempt conversion to TEXT
    378     if (!$output_type || ($output_type =~ m/text/i)) {     
    379         $success = &xpdf_to_text($dirname, $input_filename, $output_filestem);
     381    # Proper paged_text processing not yet implemented with xpdf
     382    if ($output_type =~ m/text/i) {
     383        $success = &xpdf_to_text($dirname, $input_filename, $output_filestem, $output_type);
    380384       
    381385        if ($success) {
     
    383387        }
    384388    }
    385   }
    386    
     389    }
     390   
    387391    return "fail";
    388392
     
    10481052# Works for Windows too, whereas the old pdftotxt didn't
    10491053sub xpdf_to_text {
    1050     my ($dirname, $input_filename, $output_filestem) = @_;
     1054    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
    10511055
    10521056    my $cmd = "";
     
    10641068        $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html
    10651069    }
     1070
     1071    if ($output_type ne "paged_text") { # output_type eq "text", don't bother about page break markers
    10661072    $cmd .= " -nopgbrk";
     1073    }
    10671074    # Avoid the silly solitary carriage returns (CR in Notepad) at the end
    10681075    # of lines that ends up as \n appended to the doc title
Note: See TracChangeset for help on using the changeset viewer.