Context Navigation

← Previous Changeset
Next Changeset →

Changeset 32277

Timestamp:

2018-07-16T21:28:00+12:00 (6 years ago)

Author:

ak19

Message:

First attempt at PDFv2Plugin.pm.

Location:

main/trunk/greenstone2

Files:

: 1 added
: 5 edited

bin/script/gsConvert.pl (modified) (8 diffs)
perllib/plugins/ConvertBinaryFile.pm (modified) (1 diff)
perllib/plugins/PDFPlugin.pm (modified) (1 diff)
perllib/plugins/PDFv1Plugin.pm (modified) (2 diffs)
perllib/plugins/PDFv2Plugin.pm (added)
perllib/strings.properties (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/greenstone2/bin/script/gsConvert.pl

-              r32273
+              r32277
     print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
     print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
     print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
+    print STDERR "\t-output\tauto|html|pretty_html|paged_pretty_html|paged_html|text|paged_text|pagedimg_jpg|pagedimg_gif|pagedimg_png|pagedimgtxt_jpg|pagedimgtxt_png\t(output file type)\n";
     print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
     print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
 …
     my $success = 0;
     $output_type =~ s/.*\-(.*)/$1/i;
+    print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n";
     # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools
     # and then decide which conversion command to run based on the output type
 …
+  }
+  elsif ($pdf_tool eq "xpdftools" ) {
+    # default to html output
+    elsif ($pdf_tool eq "xpdftools" ) {
+    # default to pretty html output
     if (!$output_type) {
         $output_type = "html";
+        $output_type = "pretty_html";
+    }
 …
     #}
     # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools.
     if ($output_type =~ m/^(paged_html|html)$/i) {
+    # Attempt conversion to (paged) pretty HTML using the newer pdftohtml of Xpdftools.
+    if ($output_type =~ m/pretty_html$/i) {
         $success = &xpdf_to_html($dirname, $input_filename, $output_filestem);
         if ($success) {
 …
     # Attempt conversion to TEXT
+    if (!$output_type || ($output_type =~ m/text/i)) {
+        $success = &xpdf_to_text($dirname, $input_filename, $output_filestem);
+    # Proper paged_text processing not yet implemented with xpdf
+    if ($output_type =~ m/text/i) {
+        $success = &xpdf_to_text($dirname, $input_filename, $output_filestem, $output_type);
         if ($success) {
 …
+        }
+    }
+  }
+    }
     return "fail";
 …
 # Works for Windows too, whereas the old pdftotxt didn't
 sub xpdf_to_text {
     my ($dirname, $input_filename, $output_filestem) = @_;
+    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
     my $cmd = "";
 …
         $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html
+    }
+    if ($output_type ne "paged_text") { # output_type eq "text", don't bother about page break markers
     $cmd .= " -nopgbrk";
+    }
     # Avoid the silly solitary carriage returns (CR in Notepad) at the end
     # of lines that end up as \n appended to the doc title

main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

r32206	r32277
161	161	}
162	162
163		if ($convert_to =~ /^html/ || $convert_to eq "paged_html") { # may be html or html_multi, or paged_html with the new Xpdf's own pdftohtml
	163	if ($convert_to =~ /^html/ || $convert_to =~ /pretty_html$/) { # may be html or html_multi, or (paged_)pretty_html with the new Xpdf's own pdftohtml
164	164	$self->{'convert_to_plugin'} = "HTMLPlugin";
165	165	$self->{'convert_to_ext'} = "html";

main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

r32275	r32277
159	159	# check convert_to
160	160	if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
161		&gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n", "PDFPlugin");
	161	&gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n");
162	162	$self->{'convert_to'} = "html";
163	163	}

main/trunk/greenstone2/perllib/plugins/PDFv1Plugin.pm

-              r32275
+              r32277
        'type' => "flag" },
      { 'name' => "zoom",
        'desc' => "{PDFPlugin.zoom}",
+       'desc' => "{PDFv1Plugin.zoom}",
        'deft' => "2",
        'range' => "1,3", # actually the range is 0.5-3
 …
     # check convert_to
     if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
     &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n", "PDFv1Plugin");
+    &gsprintf::gsprintf(STDERR, "{PDFv1Plugin.win_old_pdftotext_unsupported}\n");
     $self->{'convert_to'} = "html";
+    }

main/trunk/greenstone2/perllib/strings.properties

-              r32275
+              r32277
 ConvertBinaryFile.convert_to.text:Plain text format.
+ConvertBinaryFile.convert_to.paged_text:Text separately extracted for each individual page.
 ConvertBinaryFile.convert_to.pagedimg:A series of images.
 …
 ConvertBinaryFile.convert_to.pagedimg_png:A series of images in PNG format.
+ConvertBinaryFile.convert_to.pagedimgtxt_jpg:A series of images in JPEG format with any extracted text, one for each page.
+ConvertBinaryFile.convert_to.pagedimgtxt_png:A series of images in PNG format with any extracted text, one for each page.
 ConvertBinaryFile.desc:This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin, PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion of these document types to either HTML, TEXT or a series of images. It works by dynamically loading an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin, PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
 …
 PDFPlugin.complex:Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).
+PDFPlugin.convert_to.paged_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings.
+PDFPlugin.convert_to.html:HTML. Text only, no images.
+PDFPlugin.convert_to.pretty_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings.
+PDFPlugin.convert_to.paged_pretty_html:Sectionalised variant of pretty_html to allow jumping to individual pages.
 PDFPlugin.deprecated_plugin:*************IMPORTANT******************\nPDFPlugin is being deprecated.\nConsider upgrading to the recommended PDFv2Plugin, which supports newer versions of PDFs.\nAlternatively, if you wish to retain the old style of conversion and are NOT relying on PDFBox,\nchange to PDFv1Plugin.\nIf you are using PDFBox then upgrade to PDFv2Plugin.\n*****************************************\n
 …
 PDFPlugin.use_sections:Create a separate section for each page of the PDF file.
+PDFPlugin.win_old_pdftotext_unsupported:*** On Windows, %s does not support pdf to text. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin if you really want pdf to text conversion.
+PDFPlugin.zoom:The factor by which to zoom the PDF for output. If not outputting as paged_html, then zoom is only useful if -complex is set.
+PDFPlugin.win_old_pdftotext_unsupported:*** On Windows, PDFPlugin pdfbox_conversion must be turned on for text output. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin for additional pdf to text conversion options.
+PDFv1Plugin.win_old_pdftotext_unsupported:*** On Windows, PDFv1Plugin does not support pdf to text. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin if you want pdf to actual text conversion.
+PDFv1Plugin.zoom:The factor by which to zoom the PDF for output. Only useful if -complex is set.
+PDFv2Plugin.zoom:The factor by which to zoom the PDF for (paged_)pretty_html output. Can be fractional.
+PDFv2Plugin.win_pdftotext_info:PDFv2Plugin uses Xpdf Tools to support pdf to text conversion, including on Windows.
+PDFv2Plugin.conversion_needs_pdfbox:*** Conversion to %s not supported with Xpdf Tools, defaulting to %s. Turn on pdfbox_conversion if you wish to enable output to selected format.
 PostScriptPlugin.desc:This is a \"poor man's\" ps to text converter. If you are serious, consider using the PRESCRIPT package, which is available for download at http://www.nzdl.org/html/software.html

Note: See TracChangeset for help on using the changeset viewer.