source: gs2-extensions/pdf-box/trunk/java/src/GS_PDFToImagesAndText.java@ 32193

Last change on this file since 32193 was 32193, checked in by ak19, 6 years ago

All the *essential* changes related to the PDFBox modifications Kathy asked for. The PDFBox app used to be used either to generate images for every PDF page or to extract txt from the PDF. Kathy wanted to ideally produce paged images with extracted text, where available, so that this would be searchable. So images AND extracted text. Her idea was to modify the pdfbox app code to do it: a new class based on the existing one that generated the images for each page that would (based on Kathy's answers to my questions) need to be modified to additionally extract the text of each page, so that txt search results matched the correct img page presented. Might as well upgrade the pdfbox app version our GS code used. After testing that the latest version (2.09) did not have any of the issues for which we previously settled on v 1.8.2 (lower than the then most up to date version), the necessary code changes were made. All of these are documented in the newly included GS_PDFBox_README.txt. The new java file is called GS_PDFToImagesAndText.java and is located in the new java/src subfolder. This will need to be put into the pdfbox app 2.09 *src* code to be built, and the generated class file should then be copied into the java/lib/java/pdfbox-app.jar, all as explained in the GS_PDFBox_README.txt. Other files modified for the changes requested by Kathy are PDFBoxConverter.pm, to refer to our new class and its new java package location as packages have changed in 2.09, and util.pm's create_itemfile() function which now may additionally deal with txt files matching each img file generated. (Not committing minor adjustment to ReadTextFile.pm to prevent a warning, as my fix seems hacky. But the fix is described in the Readme). The pdfbox ext zip/tarballs were also modified to contain the changed PDFBoxConverter.pm and pdfbox-app jar file for 2.09 with our custom new class file.
Nothing has been renamed to gs-pdfbox-app yet, as there will be flow-on effects elsewhere as described in the Readme; all of this can be done in a separate commit.

File size: 14.5 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17package org.apache.pdfbox.tools;
18
19import java.awt.HeadlessException;
20import java.awt.Toolkit;
21import java.awt.image.BufferedImage;
22import java.io.File;
23import java.io.FileOutputStream;
24import java.io.IOException;
25import java.io.OutputStreamWriter;
26import java.io.Writer;
27import org.apache.pdfbox.io.IOUtils;
28
29import javax.imageio.ImageIO;
30
31import org.apache.pdfbox.pdmodel.PDDocument;
32import org.apache.pdfbox.pdmodel.PDPage;
33import org.apache.pdfbox.pdmodel.common.PDRectangle;
34import org.apache.pdfbox.rendering.ImageType;
35import org.apache.pdfbox.rendering.PDFRenderer;
36import org.apache.pdfbox.tools.imageio.ImageIOUtil;
37
38import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
39import org.apache.pdfbox.text.PDFTextStripper;
40
41/**
42 * This class is based on PDFToImage.java which converts
43 * the pages of a PDF document to images.
44 * This class should convert the pages to images and
45 * extract the text of each page. That part of the code
46 * to be taken from ExtractText.java
47 *
48 * Built on Apache PDFBox's PDFToImage.java with minor modifications.
49 * ak19
50 */
51public final class GS_PDFToImagesAndText
52{
53 private static final String PASSWORD = "-password";
54 private static final String ENCODING = "-encoding";
55 private static final String START_PAGE = "-startPage";
56 private static final String END_PAGE = "-endPage";
57 private static final String PAGE = "-page";
58 private static final String IMAGE_TYPE = "-imageType";
59 private static final String FORMAT = "-format";
60 private static final String OUTPUT_PREFIX = "-outputPrefix";
61 private static final String PREFIX = "-prefix";
62 private static final String COLOR = "-color";
63 private static final String RESOLUTION = "-resolution";
64 private static final String DPI = "-dpi";
65 private static final String CROPBOX = "-cropbox";
66 private static final String TIME = "-time";
67
68 private static final String STD_ENCODING = "UTF-8";
69
70 /**
71 * private constructor.
72 */
73 private GS_PDFToImagesAndText()
74 {
75 //static class
76 }
77
78 /**
79 * Infamous main method.
80 *
81 * @param args Command line arguments, should be one and a reference to a file.
82 *
83 * @throws IOException If there is an error parsing the document.
84 */
85 public static void main( String[] args ) throws IOException
86 {
87 try
88 {
89 // force KCMS (faster than LCMS) if available
90 Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
91 System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
92 }
93 catch (ClassNotFoundException e)
94 {
95 // do nothing
96 }
97
98 // suppress the Dock icon on OS X
99 System.setProperty("apple.awt.UIElement", "true");
100
101 String password = "";
102 String encoding = STD_ENCODING;
103 String pdfFile = null;
104 String outputPrefix = null;
105 String imageFormat = "jpg";
106 int startPage = 1;
107 int endPage = Integer.MAX_VALUE;
108 String color = "rgb";
109 int dpi;
110 float cropBoxLowerLeftX = 0;
111 float cropBoxLowerLeftY = 0;
112 float cropBoxUpperRightX = 0;
113 float cropBoxUpperRightY = 0;
114 boolean showTime = false;
115 try
116 {
117 dpi = Toolkit.getDefaultToolkit().getScreenResolution();
118 }
119 catch( HeadlessException e )
120 {
121 dpi = 96;
122 }
123 for( int i = 0; i < args.length; i++ )
124 {
125 if( args[i].equals( PASSWORD ) )
126 {
127 i++;
128 if( i >= args.length )
129 {
130 usage();
131 }
132 password = args[i];
133 }
134 else if( args[i].equals( ENCODING ) )
135 {
136 i++;
137 if( i >= args.length )
138 {
139 usage();
140 }
141 encoding = args[i];
142 }
143 else if( args[i].equals( START_PAGE ) )
144 {
145 i++;
146 if( i >= args.length )
147 {
148 usage();
149 }
150 startPage = Integer.parseInt( args[i] );
151 }
152 else if( args[i].equals( END_PAGE ) )
153 {
154 i++;
155 if( i >= args.length )
156 {
157 usage();
158 }
159 endPage = Integer.parseInt( args[i] );
160 }
161 else if( args[i].equals( PAGE ) )
162 {
163 i++;
164 if( i >= args.length )
165 {
166 usage();
167 }
168 startPage = Integer.parseInt( args[i] );
169 endPage = Integer.parseInt( args[i] );
170 }
171 else if( args[i].equals(IMAGE_TYPE) || args[i].equals(FORMAT) )
172 {
173 i++;
174 imageFormat = args[i];
175 }
176 else if( args[i].equals( OUTPUT_PREFIX ) || args[i].equals( PREFIX ) )
177 {
178 i++;
179 outputPrefix = args[i];
180 }
181 else if( args[i].equals( COLOR ) )
182 {
183 i++;
184 color = args[i];
185 }
186 else if( args[i].equals( RESOLUTION ) || args[i].equals( DPI ) )
187 {
188 i++;
189 dpi = Integer.parseInt(args[i]);
190 }
191 else if( args[i].equals( CROPBOX ) )
192 {
193 i++;
194 cropBoxLowerLeftX = Float.valueOf(args[i]);
195 i++;
196 cropBoxLowerLeftY = Float.valueOf(args[i]);
197 i++;
198 cropBoxUpperRightX = Float.valueOf(args[i]);
199 i++;
200 cropBoxUpperRightY = Float.valueOf(args[i]);
201 }
202 else if( args[i].equals( TIME ) )
203 {
204 showTime = true;
205 }
206 else
207 {
208 if( pdfFile == null )
209 {
210 pdfFile = args[i];
211 }
212 }
213 }
214 if( pdfFile == null )
215 {
216 usage();
217 }
218 else
219 {
220 if(outputPrefix == null)
221 {
222 outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));
223 }
224
225 PDDocument document = null;
226 try
227 {
228 boolean extractingTextAllowed = true;
229 //String outputFile = null;
230
231 /*startProcessing("Loading PDF "+pdfFile);
232 if( outputFile == null && pdfFile.length() >4 )
233 {
234 outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
235 }*/
236
237 document = PDDocument.load(new File(pdfFile), password);
238
239 AccessPermission ap = document.getCurrentAccessPermission();
240 if( ! ap.canExtractContent() )
241 {
242 //throw new IOException( "You do not have permission to extract text" );
243 System.err.println( "*** You do not have permission to extract text" ); // still extract the pages as images then?
244 extractingTextAllowed = false;
245 }
246 //stopProcessing("Time for loading: ", startTime);
247
248 // don't extract to HTML in this class, just extract to txt
249 PDFTextStripper stripper = new PDFTextStripper();
250 //stripper.setSortByPosition( sort );
251 //stripper.setShouldSeparateByBeads( separateBeads );
252 stripper.setShouldSeparateByBeads( true );
253
254
255 ImageType imageType = null;
256 if ("bilevel".equalsIgnoreCase(color))
257 {
258 imageType = ImageType.BINARY;
259 }
260 else if ("gray".equalsIgnoreCase(color))
261 {
262 imageType = ImageType.GRAY;
263 }
264 else if ("rgb".equalsIgnoreCase(color))
265 {
266 imageType = ImageType.RGB;
267 }
268 else if ("rgba".equalsIgnoreCase(color))
269 {
270 imageType = ImageType.ARGB;
271 }
272
273 if (imageType == null)
274 {
275 System.err.println( "Error: Invalid color." );
276 System.exit( 2 );
277 }
278
279 //if a CropBox has been specified, update the CropBox:
280 //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
281 if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0
282 || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )
283 {
284 changeCropBox(document,
285 cropBoxLowerLeftX, cropBoxLowerLeftY,
286 cropBoxUpperRightX, cropBoxUpperRightY);
287 }
288
289 long startTime = System.nanoTime();
290
291 // render the pages
292 boolean success = true;
293 endPage = Math.min(endPage, document.getNumberOfPages());
294 PDFRenderer renderer = new PDFRenderer(document);
295 for (int i = startPage - 1; i < endPage; i++)
296 {
297 // turn page into image
298 BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
299 int lastSlash = outputPrefix.lastIndexOf(File.separator);
300 outputPrefix = outputPrefix.substring(0, lastSlash+1); // include the folder (/) but not the filename prefix
301 String fileName = outputPrefix + (i + 1) + ".";
302 success &= ImageIOUtil.writeImage(image, fileName+imageFormat, dpi);
303
304
305 // image version of page done, now extract text from current page
306 if(extractingTextAllowed) {
307 Writer output = null;
308 try {
309 output = new OutputStreamWriter( new FileOutputStream( fileName+"txt" ), encoding );
310 stripper.setStartPage( i+1 );
311 stripper.setEndPage( i+1 );
312
313 //if (debug)
314 //{
315 System.err.println("Writing to "+fileName);
316 //}
317
318 // Extract text for main document, the specified pages
319 stripper.writeText( document, output );
320 } catch (Exception ex) {
321 System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
322 } finally {
323 IOUtils.closeQuietly(output);
324 }
325 }
326 }
327
328 // GS NOTE: We just extracted text for (each page of) the main document, but
329 // we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java
330
331 // performance stats
332 long endTime = System.nanoTime();
333 long duration = endTime - startTime;
334 int count = 1 + endPage - startPage;
335 if (showTime)
336 {
337 System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
338 duration / 1000000);
339 }
340
341 if (!success)
342 {
343 System.err.println( "Error: no writer found for image format '"
344 + imageFormat + "'" );
345 System.exit(1);
346 }
347 }
348 finally
349 {
350 if( document != null )
351 {
352 document.close();
353 }
354 }
355 }
356 }
357
358 /**
359 * This will print the usage requirements and exit.
360 */
361 private static void usage()
362 {
363 String message = "Usage: java -jar pdfbox-app-x.y.z.jar GS_PDFToImagesAndText [options] <inputfile>\n"
364 + "\nOptions:\n"
365 + " -password <password> : Password to decrypt document\n"
366 + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
367 + " -format <string> : Image format: " + getImageFormats() + "\n"
368 + " -prefix <string> : Filename prefix for image files\n"
369 + " -page <number> : The only page to extract (1-based)\n"
370 + " -startPage <int> : The first page to start extraction (1-based)\n"
371 + " -endPage <int> : The last page to extract(inclusive)\n"
372 + " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"
373 + " -dpi <int> : The DPI of the output image\n"
374 + " -cropbox <int> <int> <int> <int> : The page area to export\n"
375 + " -time : Prints timing information to stdout\n"
376 + " <inputfile> : The PDF document to use\n";
377
378 System.err.println(message);
379 System.exit( 1 );
380 }
381
382 private static String getImageFormats()
383 {
384 StringBuilder retval = new StringBuilder();
385 String[] formats = ImageIO.getReaderFormatNames();
386 for( int i = 0; i < formats.length; i++ )
387 {
388 if (formats[i].equalsIgnoreCase(formats[i]))
389 {
390 retval.append( formats[i] );
391 if( i + 1 < formats.length )
392 {
393 retval.append( ", " );
394 }
395 }
396 }
397 return retval.toString();
398 }
399
400 private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
401 {
402 for (PDPage page : document.getPages())
403 {
404 System.out.println("resizing page");
405 PDRectangle rectangle = new PDRectangle();
406 rectangle.setLowerLeftX(a);
407 rectangle.setLowerLeftY(b);
408 rectangle.setUpperRightX(c);
409 rectangle.setUpperRightY(d);
410 page.setCropBox(rectangle);
411
412 }
413 }
414}
Note: See TracBrowser for help on using the repository browser.