/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.tools;

import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;

import javax.imageio.ImageIO;

import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;

41 | /**
|
---|
42 | * This class is based on PDFToImage.java which converts
|
---|
43 | * the pages of a PDF document to images.
|
---|
44 | * This class should convert the pages to images and
|
---|
45 | * extract the text of each page. That part of the code
|
---|
46 | * to be taken from ExtractText.java
|
---|
47 | *
|
---|
48 | * Built on Apache PDFBox's PDFToImage.java with minor modifications.
|
---|
49 | * ak19
|
---|
50 | */
|
---|
51 | public final class GS_PDFToImagesAndText
|
---|
52 | {
|
---|
53 | private static final String PASSWORD = "-password";
|
---|
54 | private static final String ENCODING = "-encoding";
|
---|
55 | private static final String START_PAGE = "-startPage";
|
---|
56 | private static final String END_PAGE = "-endPage";
|
---|
57 | private static final String PAGE = "-page";
|
---|
58 | private static final String IMAGE_TYPE = "-imageType";
|
---|
59 | private static final String FORMAT = "-format";
|
---|
60 | private static final String OUTPUT_PREFIX = "-outputPrefix";
|
---|
61 | private static final String PREFIX = "-prefix";
|
---|
62 | private static final String COLOR = "-color";
|
---|
63 | private static final String RESOLUTION = "-resolution";
|
---|
64 | private static final String DPI = "-dpi";
|
---|
65 | private static final String CROPBOX = "-cropbox";
|
---|
66 | private static final String TIME = "-time";
|
---|
67 |
|
---|
68 | private static final String STD_ENCODING = "UTF-8";
|
---|
69 |
|
---|
70 | /**
|
---|
71 | * private constructor.
|
---|
72 | */
|
---|
73 | private GS_PDFToImagesAndText()
|
---|
74 | {
|
---|
75 | //static class
|
---|
76 | }
|
---|
77 |
|
---|
78 | /**
|
---|
79 | * Infamous main method.
|
---|
80 | *
|
---|
81 | * @param args Command line arguments, should be one and a reference to a file.
|
---|
82 | *
|
---|
83 | * @throws IOException If there is an error parsing the document.
|
---|
84 | */
|
---|
85 | public static void main( String[] args ) throws IOException
|
---|
86 | {
|
---|
87 | try
|
---|
88 | {
|
---|
89 | // force KCMS (faster than LCMS) if available
|
---|
90 | Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
|
---|
91 | System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
|
---|
92 | }
|
---|
93 | catch (ClassNotFoundException e)
|
---|
94 | {
|
---|
95 | // do nothing
|
---|
96 | }
|
---|
97 |
|
---|
98 | // suppress the Dock icon on OS X
|
---|
99 | System.setProperty("apple.awt.UIElement", "true");
|
---|
100 |
|
---|
101 | String password = "";
|
---|
102 | String encoding = STD_ENCODING;
|
---|
103 | String pdfFile = null;
|
---|
104 | String outputPrefix = null;
|
---|
105 | String imageFormat = "jpg";
|
---|
106 | int startPage = 1;
|
---|
107 | int endPage = Integer.MAX_VALUE;
|
---|
108 | String color = "rgb";
|
---|
109 | int dpi;
|
---|
110 | float cropBoxLowerLeftX = 0;
|
---|
111 | float cropBoxLowerLeftY = 0;
|
---|
112 | float cropBoxUpperRightX = 0;
|
---|
113 | float cropBoxUpperRightY = 0;
|
---|
114 | boolean showTime = false;
|
---|
115 | try
|
---|
116 | {
|
---|
117 | dpi = Toolkit.getDefaultToolkit().getScreenResolution();
|
---|
118 | }
|
---|
119 | catch( HeadlessException e )
|
---|
120 | {
|
---|
121 | dpi = 96;
|
---|
122 | }
|
---|
123 | for( int i = 0; i < args.length; i++ )
|
---|
124 | {
|
---|
125 | if( args[i].equals( PASSWORD ) )
|
---|
126 | {
|
---|
127 | i++;
|
---|
128 | if( i >= args.length )
|
---|
129 | {
|
---|
130 | usage();
|
---|
131 | }
|
---|
132 | password = args[i];
|
---|
133 | }
|
---|
134 | else if( args[i].equals( ENCODING ) )
|
---|
135 | {
|
---|
136 | i++;
|
---|
137 | if( i >= args.length )
|
---|
138 | {
|
---|
139 | usage();
|
---|
140 | }
|
---|
141 | encoding = args[i];
|
---|
142 | }
|
---|
143 | else if( args[i].equals( START_PAGE ) )
|
---|
144 | {
|
---|
145 | i++;
|
---|
146 | if( i >= args.length )
|
---|
147 | {
|
---|
148 | usage();
|
---|
149 | }
|
---|
150 | startPage = Integer.parseInt( args[i] );
|
---|
151 | }
|
---|
152 | else if( args[i].equals( END_PAGE ) )
|
---|
153 | {
|
---|
154 | i++;
|
---|
155 | if( i >= args.length )
|
---|
156 | {
|
---|
157 | usage();
|
---|
158 | }
|
---|
159 | endPage = Integer.parseInt( args[i] );
|
---|
160 | }
|
---|
161 | else if( args[i].equals( PAGE ) )
|
---|
162 | {
|
---|
163 | i++;
|
---|
164 | if( i >= args.length )
|
---|
165 | {
|
---|
166 | usage();
|
---|
167 | }
|
---|
168 | startPage = Integer.parseInt( args[i] );
|
---|
169 | endPage = Integer.parseInt( args[i] );
|
---|
170 | }
|
---|
171 | else if( args[i].equals(IMAGE_TYPE) || args[i].equals(FORMAT) )
|
---|
172 | {
|
---|
173 | i++;
|
---|
174 | imageFormat = args[i];
|
---|
175 | }
|
---|
176 | else if( args[i].equals( OUTPUT_PREFIX ) || args[i].equals( PREFIX ) )
|
---|
177 | {
|
---|
178 | i++;
|
---|
179 | outputPrefix = args[i];
|
---|
180 | }
|
---|
181 | else if( args[i].equals( COLOR ) )
|
---|
182 | {
|
---|
183 | i++;
|
---|
184 | color = args[i];
|
---|
185 | }
|
---|
186 | else if( args[i].equals( RESOLUTION ) || args[i].equals( DPI ) )
|
---|
187 | {
|
---|
188 | i++;
|
---|
189 | dpi = Integer.parseInt(args[i]);
|
---|
190 | }
|
---|
191 | else if( args[i].equals( CROPBOX ) )
|
---|
192 | {
|
---|
193 | i++;
|
---|
194 | cropBoxLowerLeftX = Float.valueOf(args[i]);
|
---|
195 | i++;
|
---|
196 | cropBoxLowerLeftY = Float.valueOf(args[i]);
|
---|
197 | i++;
|
---|
198 | cropBoxUpperRightX = Float.valueOf(args[i]);
|
---|
199 | i++;
|
---|
200 | cropBoxUpperRightY = Float.valueOf(args[i]);
|
---|
201 | }
|
---|
202 | else if( args[i].equals( TIME ) )
|
---|
203 | {
|
---|
204 | showTime = true;
|
---|
205 | }
|
---|
206 | else
|
---|
207 | {
|
---|
208 | if( pdfFile == null )
|
---|
209 | {
|
---|
210 | pdfFile = args[i];
|
---|
211 | }
|
---|
212 | }
|
---|
213 | }
|
---|
214 | if( pdfFile == null )
|
---|
215 | {
|
---|
216 | usage();
|
---|
217 | }
|
---|
218 | else
|
---|
219 | {
|
---|
220 | if(outputPrefix == null)
|
---|
221 | {
|
---|
222 | outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));
|
---|
223 | }
|
---|
224 |
|
---|
225 | PDDocument document = null;
|
---|
226 | try
|
---|
227 | {
|
---|
228 | boolean extractingTextAllowed = true;
|
---|
229 | //String outputFile = null;
|
---|
230 |
|
---|
231 | /*startProcessing("Loading PDF "+pdfFile);
|
---|
232 | if( outputFile == null && pdfFile.length() >4 )
|
---|
233 | {
|
---|
234 | outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
|
---|
235 | }*/
|
---|
236 |
|
---|
237 | document = PDDocument.load(new File(pdfFile), password);
|
---|
238 |
|
---|
239 | AccessPermission ap = document.getCurrentAccessPermission();
|
---|
240 | if( ! ap.canExtractContent() )
|
---|
241 | {
|
---|
242 | //throw new IOException( "You do not have permission to extract text" );
|
---|
243 | System.err.println( "*** You do not have permission to extract text" ); // still extract the pages as images then?
|
---|
244 | extractingTextAllowed = false;
|
---|
245 | }
|
---|
246 | //stopProcessing("Time for loading: ", startTime);
|
---|
247 |
|
---|
248 | // don't extract to HTML in this class, just extract to txt
|
---|
249 | PDFTextStripper stripper = new PDFTextStripper();
|
---|
250 | //stripper.setSortByPosition( sort );
|
---|
251 | //stripper.setShouldSeparateByBeads( separateBeads );
|
---|
252 | stripper.setShouldSeparateByBeads( true );
|
---|
253 |
|
---|
254 |
|
---|
255 | ImageType imageType = null;
|
---|
256 | if ("bilevel".equalsIgnoreCase(color))
|
---|
257 | {
|
---|
258 | imageType = ImageType.BINARY;
|
---|
259 | }
|
---|
260 | else if ("gray".equalsIgnoreCase(color))
|
---|
261 | {
|
---|
262 | imageType = ImageType.GRAY;
|
---|
263 | }
|
---|
264 | else if ("rgb".equalsIgnoreCase(color))
|
---|
265 | {
|
---|
266 | imageType = ImageType.RGB;
|
---|
267 | }
|
---|
268 | else if ("rgba".equalsIgnoreCase(color))
|
---|
269 | {
|
---|
270 | imageType = ImageType.ARGB;
|
---|
271 | }
|
---|
272 |
|
---|
273 | if (imageType == null)
|
---|
274 | {
|
---|
275 | System.err.println( "Error: Invalid color." );
|
---|
276 | System.exit( 2 );
|
---|
277 | }
|
---|
278 |
|
---|
279 | //if a CropBox has been specified, update the CropBox:
|
---|
280 | //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
|
---|
281 | if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0
|
---|
282 | || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )
|
---|
283 | {
|
---|
284 | changeCropBox(document,
|
---|
285 | cropBoxLowerLeftX, cropBoxLowerLeftY,
|
---|
286 | cropBoxUpperRightX, cropBoxUpperRightY);
|
---|
287 | }
|
---|
288 |
|
---|
289 | long startTime = System.nanoTime();
|
---|
290 |
|
---|
291 | // render the pages
|
---|
292 | boolean success = true;
|
---|
293 | endPage = Math.min(endPage, document.getNumberOfPages());
|
---|
294 | PDFRenderer renderer = new PDFRenderer(document);
|
---|
295 | for (int i = startPage - 1; i < endPage; i++)
|
---|
296 | {
|
---|
297 | // turn page into image
|
---|
298 | BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
|
---|
299 | int lastSlash = outputPrefix.lastIndexOf(File.separator);
|
---|
300 | outputPrefix = outputPrefix.substring(0, lastSlash+1); // include the folder (/) but not the filename prefix
|
---|
301 | String fileName = outputPrefix + (i + 1) + ".";
|
---|
302 | success &= ImageIOUtil.writeImage(image, fileName+imageFormat, dpi);
|
---|
303 |
|
---|
304 |
|
---|
305 | // image version of page done, now extract text from current page
|
---|
306 | if(extractingTextAllowed) {
|
---|
307 | Writer output = null;
|
---|
308 | try {
|
---|
309 | output = new OutputStreamWriter( new FileOutputStream( fileName+"txt" ), encoding );
|
---|
310 | stripper.setStartPage( i+1 );
|
---|
311 | stripper.setEndPage( i+1 );
|
---|
312 |
|
---|
313 | //if (debug)
|
---|
314 | //{
|
---|
315 | System.err.println("Writing to "+fileName);
|
---|
316 | //}
|
---|
317 |
|
---|
318 | // Extract text for main document, the specified pages
|
---|
319 | stripper.writeText( document, output );
|
---|
320 | } catch (Exception ex) {
|
---|
321 | System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
|
---|
322 | } finally {
|
---|
323 | IOUtils.closeQuietly(output);
|
---|
324 | }
|
---|
325 | }
|
---|
326 | }
|
---|
327 |
|
---|
328 | // GS NOTE: We just extracted text for (each page of) the main document, but
|
---|
329 | // we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java
|
---|
330 |
|
---|
331 | // performance stats
|
---|
332 | long endTime = System.nanoTime();
|
---|
333 | long duration = endTime - startTime;
|
---|
334 | int count = 1 + endPage - startPage;
|
---|
335 | if (showTime)
|
---|
336 | {
|
---|
337 | System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
|
---|
338 | duration / 1000000);
|
---|
339 | }
|
---|
340 |
|
---|
341 | if (!success)
|
---|
342 | {
|
---|
343 | System.err.println( "Error: no writer found for image format '"
|
---|
344 | + imageFormat + "'" );
|
---|
345 | System.exit(1);
|
---|
346 | }
|
---|
347 | }
|
---|
348 | finally
|
---|
349 | {
|
---|
350 | if( document != null )
|
---|
351 | {
|
---|
352 | document.close();
|
---|
353 | }
|
---|
354 | }
|
---|
355 | }
|
---|
356 | }
|
---|
357 |
|
---|
358 | /**
|
---|
359 | * This will print the usage requirements and exit.
|
---|
360 | */
|
---|
361 | private static void usage()
|
---|
362 | {
|
---|
363 | String message = "Usage: java -jar pdfbox-app-x.y.z.jar GS_PDFToImagesAndText [options] <inputfile>\n"
|
---|
364 | + "\nOptions:\n"
|
---|
365 | + " -password <password> : Password to decrypt document\n"
|
---|
366 | + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
|
---|
367 | + " -format <string> : Image format: " + getImageFormats() + "\n"
|
---|
368 | + " -prefix <string> : Filename prefix for image files\n"
|
---|
369 | + " -page <number> : The only page to extract (1-based)\n"
|
---|
370 | + " -startPage <int> : The first page to start extraction (1-based)\n"
|
---|
371 | + " -endPage <int> : The last page to extract(inclusive)\n"
|
---|
372 | + " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"
|
---|
373 | + " -dpi <int> : The DPI of the output image\n"
|
---|
374 | + " -cropbox <int> <int> <int> <int> : The page area to export\n"
|
---|
375 | + " -time : Prints timing information to stdout\n"
|
---|
376 | + " <inputfile> : The PDF document to use\n";
|
---|
377 |
|
---|
378 | System.err.println(message);
|
---|
379 | System.exit( 1 );
|
---|
380 | }
|
---|
381 |
|
---|
382 | private static String getImageFormats()
|
---|
383 | {
|
---|
384 | StringBuilder retval = new StringBuilder();
|
---|
385 | String[] formats = ImageIO.getReaderFormatNames();
|
---|
386 | for( int i = 0; i < formats.length; i++ )
|
---|
387 | {
|
---|
388 | if (formats[i].equalsIgnoreCase(formats[i]))
|
---|
389 | {
|
---|
390 | retval.append( formats[i] );
|
---|
391 | if( i + 1 < formats.length )
|
---|
392 | {
|
---|
393 | retval.append( ", " );
|
---|
394 | }
|
---|
395 | }
|
---|
396 | }
|
---|
397 | return retval.toString();
|
---|
398 | }
|
---|
399 |
|
---|
400 | private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
|
---|
401 | {
|
---|
402 | for (PDPage page : document.getPages())
|
---|
403 | {
|
---|
404 | System.out.println("resizing page");
|
---|
405 | PDRectangle rectangle = new PDRectangle();
|
---|
406 | rectangle.setLowerLeftX(a);
|
---|
407 | rectangle.setLowerLeftY(b);
|
---|
408 | rectangle.setUpperRightX(c);
|
---|
409 | rectangle.setUpperRightY(d);
|
---|
410 | page.setCropBox(rectangle);
|
---|
411 |
|
---|
412 | }
|
---|
413 | }
|
---|
414 | }
|
---|