Context Navigation

← Previous Change
Next Change →

gsConvert.pl

Timestamp:

2018-07-16T21:28:00+12:00 (6 years ago)

Author:

ak19

Message:

First attempt at PDFv2Plugin.pm.

File:

1 edited

main/trunk/greenstone2/bin/script/gsConvert.pl (modified) (8 diffs)

Legend:

- Unmodified
- Added
- Removed

main/trunk/greenstone2/bin/script/gsConvert.pl

-              r32273
+              r32277
     print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
     print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
     print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
+    print STDERR "\t-output\tauto|html|pretty_html|paged_pretty_html|paged_html|text|paged_text|pagedimg_jpg|pagedimg_gif|pagedimg_png|pagedimgtxt_jpg|pagedimgtxt_png\t(output file type)\n";
     print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
     print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
 …
     my $success = 0;
     $output_type =~ s/.*\-(.*)/$1/i;
+    print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n";
     # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools
     # and then decide which conversion command to run based on the output type
 …
+  }
+  elsif ($pdf_tool eq "xpdftools" ) {
+    # default to html output
+    elsif ($pdf_tool eq "xpdftools" ) {
+    # default to pretty html output
     if (!$output_type) {
         $output_type = "html";
+        $output_type = "pretty_html";
+    }
 …
     #}
     # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools.
     if ($output_type =~ m/^(paged_html|html)$/i) {
+    # Attempt conversion to (paged) pretty HTML using the newer pdftohtml of Xpdftools.
+    if ($output_type =~ m/pretty_html$/i) {
         $success = &xpdf_to_html($dirname, $input_filename, $output_filestem);
         if ($success) {
 …
     # Attempt conversion to TEXT
+    if (!$output_type || ($output_type =~ m/text/i)) {
+        $success = &xpdf_to_text($dirname, $input_filename, $output_filestem);
+    # Proper paged_text processing not yet implemented with xpdf
+    if ($output_type =~ m/text/i) {
+        $success = &xpdf_to_text($dirname, $input_filename, $output_filestem, $output_type);
         if ($success) {
 …
+        }
+    }
+  }
+    }
     return "fail";
 …
 # Works for Windows too, whereas the old pdftotxt didn't
 sub xpdf_to_text {
     my ($dirname, $input_filename, $output_filestem) = @_;
+    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
     my $cmd = "";
 …
         $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html
+    }
+    if ($output_type ne "paged_text") { # output_type eq "text", don't bother about page break markers
     $cmd .= " -nopgbrk";
+    }
     # Avoid the silly solitary carriage returns (CR in Notepad) at the end
     # of lines that end up as \n appended to the doc title

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 32277 for main/trunk/greenstone2/bin/script/gsConvert.pl

Legend:

main/trunk/greenstone2/bin/script/gsConvert.pl

Download in other formats: