/**********************************************************************
 *
 * PDFBoxToImagesAndText.java based on Apache PDFBox®'s PDFToImage.java
 * with further code spliced in from its ExtractText.java with some
 * minor modifications.
 *
 * The code in this file is therefore under the same Apache License
 * version 2.0 as Apache's PDFBox.
 *
 * Copyright 2018 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the Apache License version 2.0.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Apache License version 2.0 for more details.
 *
 * You should have received a copy of the Apache License version 2.0
 * along with this file; if not, refer to
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * The following comment is from the original file,
 * PDFBox's PDFToImage.java
 *
 *********************************************************************/
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.greenstone.pdfbox;

import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;

import javax.imageio.ImageIO;

import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;

72 | /**
|
---|
73 | * This class is based on PDFToImage.java which converts
|
---|
74 | * the pages of a PDF document to images.
|
---|
75 | * This class should convert the pages to images and
|
---|
76 | * extract the text of each page. The latter part of the code
|
---|
77 | * is taken from ExtractText.java.
|
---|
78 | * Variables textOnly and imagesOnly determine which aspect
|
---|
79 | * is output for each page, or whether both an image and text
|
---|
80 | * are output per page.
|
---|
81 | *
|
---|
82 | * Built on Apache PDFBox's PDFToImage.java with minor modifications.
|
---|
83 | * ak19
|
---|
84 | */
|
---|
85 | public final class PDFBoxToImagesAndText
|
---|
86 | {
|
---|
87 | private static final String PASSWORD = "-password";
|
---|
88 | private static final String ENCODING = "-encoding";
|
---|
89 | private static final String START_PAGE = "-startPage";
|
---|
90 | private static final String END_PAGE = "-endPage";
|
---|
91 | private static final String PAGE = "-page";
|
---|
92 | private static final String IMAGE_TYPE = "-imageType";
|
---|
93 | private static final String FORMAT = "-format";
|
---|
94 | private static final String OUTPUT_PREFIX = "-outputPrefix";
|
---|
95 | private static final String PREFIX = "-prefix";
|
---|
96 | private static final String COLOR = "-color";
|
---|
97 | private static final String RESOLUTION = "-resolution";
|
---|
98 | private static final String DPI = "-dpi";
|
---|
99 | private static final String CROPBOX = "-cropbox";
|
---|
100 | private static final String TIME = "-time";
|
---|
101 | private static final String TEXT_ONLY = "-textOnly"; // output just the text per page
|
---|
102 | private static final String IMAGES_ONLY = "-imagesOnly"; // output just an image per page
|
---|
103 |
|
---|
104 | private static final String STD_ENCODING = "UTF-8";
|
---|
105 |
|
---|
106 | /**
|
---|
107 | * private constructor.
|
---|
108 | */
|
---|
109 | private PDFBoxToImagesAndText()
|
---|
110 | {
|
---|
111 | //static class
|
---|
112 | }
|
---|
113 |
|
---|
114 | /**
|
---|
115 | * Infamous main method.
|
---|
116 | *
|
---|
117 | * @param args Command line arguments, should be one and a reference to a file.
|
---|
118 | *
|
---|
119 | * @throws IOException If there is an error parsing the document.
|
---|
120 | */
|
---|
121 | public static void main( String[] args ) throws IOException
|
---|
122 | {
|
---|
123 | try
|
---|
124 | {
|
---|
125 | // force KCMS (faster than LCMS) if available
|
---|
126 | Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
|
---|
127 | System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
|
---|
128 | }
|
---|
129 | catch (ClassNotFoundException e)
|
---|
130 | {
|
---|
131 | // do nothing
|
---|
132 | }
|
---|
133 |
|
---|
134 | // suppress the Dock icon on OS X
|
---|
135 | System.setProperty("apple.awt.UIElement", "true");
|
---|
136 |
|
---|
137 | String password = "";
|
---|
138 | String encoding = STD_ENCODING;
|
---|
139 | String pdfFile = null;
|
---|
140 | String outputPrefix = null;
|
---|
141 | String imageFormat = "jpg";
|
---|
142 | int startPage = 1;
|
---|
143 | int endPage = Integer.MAX_VALUE;
|
---|
144 | String color = "rgb";
|
---|
145 | int dpi;
|
---|
146 | float cropBoxLowerLeftX = 0;
|
---|
147 | float cropBoxLowerLeftY = 0;
|
---|
148 | float cropBoxUpperRightX = 0;
|
---|
149 | float cropBoxUpperRightY = 0;
|
---|
150 | boolean showTime = false;
|
---|
151 | boolean textOnly = false;
|
---|
152 | boolean imagesOnly = false;
|
---|
153 |
|
---|
154 | try
|
---|
155 | {
|
---|
156 | dpi = Toolkit.getDefaultToolkit().getScreenResolution();
|
---|
157 | }
|
---|
158 | catch( HeadlessException e )
|
---|
159 | {
|
---|
160 | dpi = 96;
|
---|
161 | }
|
---|
162 | for( int i = 0; i < args.length; i++ )
|
---|
163 | {
|
---|
164 | if( args[i].equals( PASSWORD ) )
|
---|
165 | {
|
---|
166 | i++;
|
---|
167 | if( i >= args.length )
|
---|
168 | {
|
---|
169 | usage();
|
---|
170 | }
|
---|
171 | password = args[i];
|
---|
172 | }
|
---|
173 | else if( args[i].equals( ENCODING ) )
|
---|
174 | {
|
---|
175 | i++;
|
---|
176 | if( i >= args.length )
|
---|
177 | {
|
---|
178 | usage();
|
---|
179 | }
|
---|
180 | encoding = args[i];
|
---|
181 | }
|
---|
182 | else if( args[i].equals( START_PAGE ) )
|
---|
183 | {
|
---|
184 | i++;
|
---|
185 | if( i >= args.length )
|
---|
186 | {
|
---|
187 | usage();
|
---|
188 | }
|
---|
189 | startPage = Integer.parseInt( args[i] );
|
---|
190 | }
|
---|
191 | else if( args[i].equals( END_PAGE ) )
|
---|
192 | {
|
---|
193 | i++;
|
---|
194 | if( i >= args.length )
|
---|
195 | {
|
---|
196 | usage();
|
---|
197 | }
|
---|
198 | endPage = Integer.parseInt( args[i] );
|
---|
199 | }
|
---|
200 | else if( args[i].equals( PAGE ) )
|
---|
201 | {
|
---|
202 | i++;
|
---|
203 | if( i >= args.length )
|
---|
204 | {
|
---|
205 | usage();
|
---|
206 | }
|
---|
207 | startPage = Integer.parseInt( args[i] );
|
---|
208 | endPage = Integer.parseInt( args[i] );
|
---|
209 | }
|
---|
210 | else if( args[i].equals(IMAGE_TYPE) || args[i].equals(FORMAT) )
|
---|
211 | {
|
---|
212 | i++;
|
---|
213 | imageFormat = args[i];
|
---|
214 | }
|
---|
215 | else if( args[i].equals( OUTPUT_PREFIX ) || args[i].equals( PREFIX ) )
|
---|
216 | {
|
---|
217 | i++;
|
---|
218 | outputPrefix = args[i];
|
---|
219 | }
|
---|
220 | else if( args[i].equals( COLOR ) )
|
---|
221 | {
|
---|
222 | i++;
|
---|
223 | color = args[i];
|
---|
224 | }
|
---|
225 | else if( args[i].equals( RESOLUTION ) || args[i].equals( DPI ) )
|
---|
226 | {
|
---|
227 | i++;
|
---|
228 | dpi = Integer.parseInt(args[i]);
|
---|
229 | }
|
---|
230 | else if( args[i].equals( CROPBOX ) )
|
---|
231 | {
|
---|
232 | i++;
|
---|
233 | cropBoxLowerLeftX = Float.valueOf(args[i]);
|
---|
234 | i++;
|
---|
235 | cropBoxLowerLeftY = Float.valueOf(args[i]);
|
---|
236 | i++;
|
---|
237 | cropBoxUpperRightX = Float.valueOf(args[i]);
|
---|
238 | i++;
|
---|
239 | cropBoxUpperRightY = Float.valueOf(args[i]);
|
---|
240 | }
|
---|
241 | else if( args[i].equals( TEXT_ONLY ) )
|
---|
242 | {
|
---|
243 | textOnly = true;
|
---|
244 | }
|
---|
245 | else if( args[i].equals( IMAGES_ONLY ) )
|
---|
246 | {
|
---|
247 | imagesOnly = true;
|
---|
248 | }
|
---|
249 | else if( args[i].equals( TIME ) )
|
---|
250 | {
|
---|
251 | showTime = true;
|
---|
252 | }
|
---|
253 | else
|
---|
254 | {
|
---|
255 | if( pdfFile == null )
|
---|
256 | {
|
---|
257 | pdfFile = args[i];
|
---|
258 | }
|
---|
259 | }
|
---|
260 | }
|
---|
261 | if( pdfFile == null )
|
---|
262 | {
|
---|
263 | usage();
|
---|
264 | }
|
---|
265 | else
|
---|
266 | {
|
---|
267 | if(outputPrefix == null)
|
---|
268 | {
|
---|
269 | outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));
|
---|
270 | }
|
---|
271 |
|
---|
272 | PDDocument document = null;
|
---|
273 | try
|
---|
274 | {
|
---|
275 | boolean extractingTextAllowed = true;
|
---|
276 | //String outputFile = null;
|
---|
277 |
|
---|
278 | /*startProcessing("Loading PDF "+pdfFile);
|
---|
279 | if( outputFile == null && pdfFile.length() >4 )
|
---|
280 | {
|
---|
281 | outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
|
---|
282 | }*/
|
---|
283 |
|
---|
284 | document = PDDocument.load(new File(pdfFile), password);
|
---|
285 |
|
---|
286 | AccessPermission ap = document.getCurrentAccessPermission();
|
---|
287 | if( ! ap.canExtractContent() )
|
---|
288 | {
|
---|
289 | //throw new IOException( "You do not have permission to extract text" );
|
---|
290 | System.err.println( "*** You do not have permission to extract text" ); // still extract the pages as images then?
|
---|
291 | extractingTextAllowed = false;
|
---|
292 | }
|
---|
293 | //stopProcessing("Time for loading: ", startTime);
|
---|
294 |
|
---|
295 | // don't extract to HTML in this class, just extract to txt
|
---|
296 | PDFTextStripper stripper = new PDFTextStripper();
|
---|
297 | //stripper.setSortByPosition( sort );
|
---|
298 | //stripper.setShouldSeparateByBeads( separateBeads );
|
---|
299 | stripper.setShouldSeparateByBeads( true );
|
---|
300 |
|
---|
301 |
|
---|
302 | ImageType imageType = null;
|
---|
303 | if ("bilevel".equalsIgnoreCase(color))
|
---|
304 | {
|
---|
305 | imageType = ImageType.BINARY;
|
---|
306 | }
|
---|
307 | else if ("gray".equalsIgnoreCase(color))
|
---|
308 | {
|
---|
309 | imageType = ImageType.GRAY;
|
---|
310 | }
|
---|
311 | else if ("rgb".equalsIgnoreCase(color))
|
---|
312 | {
|
---|
313 | imageType = ImageType.RGB;
|
---|
314 | }
|
---|
315 | else if ("rgba".equalsIgnoreCase(color))
|
---|
316 | {
|
---|
317 | imageType = ImageType.ARGB;
|
---|
318 | }
|
---|
319 |
|
---|
320 | if (imageType == null)
|
---|
321 | {
|
---|
322 | System.err.println( "Error: Invalid color." );
|
---|
323 | System.exit( 2 );
|
---|
324 | }
|
---|
325 |
|
---|
326 | //if a CropBox has been specified, update the CropBox:
|
---|
327 | //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
|
---|
328 | if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0
|
---|
329 | || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )
|
---|
330 | {
|
---|
331 | changeCropBox(document,
|
---|
332 | cropBoxLowerLeftX, cropBoxLowerLeftY,
|
---|
333 | cropBoxUpperRightX, cropBoxUpperRightY);
|
---|
334 | }
|
---|
335 |
|
---|
336 | long startTime = System.nanoTime();
|
---|
337 |
|
---|
338 | // render the pages
|
---|
339 | boolean success = true;
|
---|
340 | endPage = Math.min(endPage, document.getNumberOfPages());
|
---|
341 | PDFRenderer renderer = new PDFRenderer(document);
|
---|
342 | for (int i = startPage - 1; i < endPage; i++)
|
---|
343 | {
|
---|
344 | int lastSlash = outputPrefix.lastIndexOf(File.separator);
|
---|
345 | outputPrefix = outputPrefix.substring(0, lastSlash+1); // include the folder (/) but not the filename prefix
|
---|
346 | String fileName = outputPrefix + (i + 1) + ".";
|
---|
347 |
|
---|
348 | if(!textOnly) {
|
---|
349 | // turn page into image
|
---|
350 | BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
|
---|
351 | success &= ImageIOUtil.writeImage(image, fileName+imageFormat, dpi);
|
---|
352 | }
|
---|
353 |
|
---|
354 | // image version of page done, now extract text from current page
|
---|
355 | if(!imagesOnly && extractingTextAllowed) {
|
---|
356 | Writer output = null;
|
---|
357 | try {
|
---|
358 | output = new OutputStreamWriter( new FileOutputStream( fileName+"txt" ), encoding );
|
---|
359 | stripper.setStartPage( i+1 );
|
---|
360 | stripper.setEndPage( i+1 );
|
---|
361 |
|
---|
362 | //if (debug)
|
---|
363 | //{
|
---|
364 | System.err.println("Writing to "+fileName);
|
---|
365 | //}
|
---|
366 |
|
---|
367 | // Extract text for main document, the specified pages
|
---|
368 | stripper.writeText( document, output );
|
---|
369 | } catch (Exception ex) {
|
---|
370 | System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
|
---|
371 | } finally {
|
---|
372 | IOUtils.closeQuietly(output);
|
---|
373 | }
|
---|
374 | }
|
---|
375 | }
|
---|
376 |
|
---|
377 | // GS NOTE: We just extracted text for (each page of) the main document, but
|
---|
378 | // we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java
|
---|
379 |
|
---|
380 | // performance stats
|
---|
381 | long endTime = System.nanoTime();
|
---|
382 | long duration = endTime - startTime;
|
---|
383 | int count = 1 + endPage - startPage;
|
---|
384 | if (showTime)
|
---|
385 | {
|
---|
386 | System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
|
---|
387 | duration / 1000000);
|
---|
388 | }
|
---|
389 |
|
---|
390 | if (!success)
|
---|
391 | {
|
---|
392 | System.err.println( "Error: no writer found for image format '"
|
---|
393 | + imageFormat + "'" );
|
---|
394 | System.exit(1);
|
---|
395 | }
|
---|
396 | }
|
---|
397 | finally
|
---|
398 | {
|
---|
399 | if( document != null )
|
---|
400 | {
|
---|
401 | document.close();
|
---|
402 | }
|
---|
403 | }
|
---|
404 | }
|
---|
405 | }
|
---|
406 |
|
---|
407 | /**
|
---|
408 | * This will print the usage requirements and exit.
|
---|
409 | */
|
---|
410 | private static void usage()
|
---|
411 | {
|
---|
412 | String message = "Usage: java -jar pdfbox-app-x.y.z.jar PDFBoxToImagesAndText [options] <inputfile>\n"
|
---|
413 | + "\nOptions:\n"
|
---|
414 | + " -password <password> : Password to decrypt document\n"
|
---|
415 | + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
|
---|
416 | + " -format <string> : Image format: " + getImageFormats() + "\n"
|
---|
417 | + " -prefix <string> : Filename prefix for image files\n"
|
---|
418 | + " -page <number> : The only page to extract (1-based)\n"
|
---|
419 | + " -startPage <int> : The first page to start extraction (1-based)\n"
|
---|
420 | + " -endPage <int> : The last page to extract(inclusive)\n"
|
---|
421 | + " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"
|
---|
422 | + " -dpi <int> : The DPI of the output image\n"
|
---|
423 | + " -cropbox <int> <int> <int> <int> : The page area to export\n"
|
---|
424 | + " -time : Prints timing information to stdout\n"
|
---|
425 | + " <inputfile> : The PDF document to use\n";
|
---|
426 |
|
---|
427 | System.err.println(message);
|
---|
428 | System.exit( 1 );
|
---|
429 | }
|
---|
430 |
|
---|
431 | private static String getImageFormats()
|
---|
432 | {
|
---|
433 | StringBuilder retval = new StringBuilder();
|
---|
434 | String[] formats = ImageIO.getReaderFormatNames();
|
---|
435 | for( int i = 0; i < formats.length; i++ )
|
---|
436 | {
|
---|
437 | if (formats[i].equalsIgnoreCase(formats[i]))
|
---|
438 | {
|
---|
439 | retval.append( formats[i] );
|
---|
440 | if( i + 1 < formats.length )
|
---|
441 | {
|
---|
442 | retval.append( ", " );
|
---|
443 | }
|
---|
444 | }
|
---|
445 | }
|
---|
446 | return retval.toString();
|
---|
447 | }
|
---|
448 |
|
---|
449 | private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
|
---|
450 | {
|
---|
451 | for (PDPage page : document.getPages())
|
---|
452 | {
|
---|
453 | System.out.println("resizing page");
|
---|
454 | PDRectangle rectangle = new PDRectangle();
|
---|
455 | rectangle.setLowerLeftX(a);
|
---|
456 | rectangle.setLowerLeftY(b);
|
---|
457 | rectangle.setUpperRightX(c);
|
---|
458 | rectangle.setUpperRightY(d);
|
---|
459 | page.setCropBox(rectangle);
|
---|
460 |
|
---|
461 | }
|
---|
462 | }
|
---|
463 | }
|
---|