source: gs2-extensions/pdf-box/trunk/java/src/org/greenstone/pdfbox/PDFBoxToImagesAndText.java@ 32278

Last change on this file since 32278 was 32278, checked in by ak19, 6 years ago

Our custom pdf-box class PDFToImagesAndText.java now takes two additional flags, textOnly and imagesOnly, which can be used to support paged_text and the original pagedimg_ output formats, besides pagedimgtxt_

File size: 16.3 KB
Line 
1/**********************************************************************
2 *
3 * PDFBoxToImagesAndText.java based on Apache PDFBox®'s PDFToImage.java
4 * with further code spliced in from its ExtractText.java with some
5 * minor modifications.
6 *
7 * The code in this file is therefore under the same Apache License
8 * version 2.0 as Apache's PDFBox.
9 *
10 * Copyright 2018 The New Zealand Digital Library Project
11 *
12 * A component of the Greenstone digital library software
13 * from the New Zealand Digital Library Project at the
14 * University of Waikato, New Zealand.
15 *
16 * This program is free software; you can redistribute it and/or modify
17 * it under the terms of the Apache License version 2.0.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * Apache License version 2.0 for more details.
23 *
24 * You should have received a copy of the Apache License version 2.0
25 * along with this file; if not, refer to
26 * https://www.apache.org/licenses/LICENSE-2.0.
27 *
28 * The following comment is from the original file,
29 * PDFBox's PDFToImage.java
30 *
31 *********************************************************************/
32/*
33 * Licensed to the Apache Software Foundation (ASF) under one or more
34 * contributor license agreements. See the NOTICE file distributed with
35 * this work for additional information regarding copyright ownership.
36 * The ASF licenses this file to You under the Apache License, Version 2.0
37 * (the "License"); you may not use this file except in compliance with
38 * the License. You may obtain a copy of the License at
39 *
40 * http://www.apache.org/licenses/LICENSE-2.0
41 *
42 * Unless required by applicable law or agreed to in writing, software
43 * distributed under the License is distributed on an "AS IS" BASIS,
44 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
45 * See the License for the specific language governing permissions and
46 * limitations under the License.
47 */
48package org.greenstone.pdfbox;
49
50import java.awt.HeadlessException;
51import java.awt.Toolkit;
52import java.awt.image.BufferedImage;
53import java.io.File;
54import java.io.FileOutputStream;
55import java.io.IOException;
56import java.io.OutputStreamWriter;
57import java.io.Writer;
58import org.apache.pdfbox.io.IOUtils;
59
60import javax.imageio.ImageIO;
61
62import org.apache.pdfbox.pdmodel.PDDocument;
63import org.apache.pdfbox.pdmodel.PDPage;
64import org.apache.pdfbox.pdmodel.common.PDRectangle;
65import org.apache.pdfbox.rendering.ImageType;
66import org.apache.pdfbox.rendering.PDFRenderer;
67import org.apache.pdfbox.tools.imageio.ImageIOUtil;
68
69import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
70import org.apache.pdfbox.text.PDFTextStripper;
71
72/**
73 * This class is based on PDFToImage.java which converts
74 * the pages of a PDF document to images.
75 * This class should convert the pages to images and
76 * extract the text of each page. The latter part of the code
77 * is taken from ExtractText.java.
78 * Variables textOnly and imagesOnly determine which aspect
79 * is output for each page, or whether both an image and text
80 * are output per page.
81 *
82 * Built on Apache PDFBox's PDFToImage.java with minor modifications.
83 * ak19
84 */
85public final class PDFBoxToImagesAndText
86{
87 private static final String PASSWORD = "-password";
88 private static final String ENCODING = "-encoding";
89 private static final String START_PAGE = "-startPage";
90 private static final String END_PAGE = "-endPage";
91 private static final String PAGE = "-page";
92 private static final String IMAGE_TYPE = "-imageType";
93 private static final String FORMAT = "-format";
94 private static final String OUTPUT_PREFIX = "-outputPrefix";
95 private static final String PREFIX = "-prefix";
96 private static final String COLOR = "-color";
97 private static final String RESOLUTION = "-resolution";
98 private static final String DPI = "-dpi";
99 private static final String CROPBOX = "-cropbox";
100 private static final String TIME = "-time";
101 private static final String TEXT_ONLY = "-textOnly"; // output just the text per page
102 private static final String IMAGES_ONLY = "-imagesOnly"; // output just an image per page
103
104 private static final String STD_ENCODING = "UTF-8";
105
106 /**
107 * private constructor.
108 */
109 private PDFBoxToImagesAndText()
110 {
111 //static class
112 }
113
114 /**
115 * Infamous main method.
116 *
117 * @param args Command line arguments, should be one and a reference to a file.
118 *
119 * @throws IOException If there is an error parsing the document.
120 */
121 public static void main( String[] args ) throws IOException
122 {
123 try
124 {
125 // force KCMS (faster than LCMS) if available
126 Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
127 System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
128 }
129 catch (ClassNotFoundException e)
130 {
131 // do nothing
132 }
133
134 // suppress the Dock icon on OS X
135 System.setProperty("apple.awt.UIElement", "true");
136
137 String password = "";
138 String encoding = STD_ENCODING;
139 String pdfFile = null;
140 String outputPrefix = null;
141 String imageFormat = "jpg";
142 int startPage = 1;
143 int endPage = Integer.MAX_VALUE;
144 String color = "rgb";
145 int dpi;
146 float cropBoxLowerLeftX = 0;
147 float cropBoxLowerLeftY = 0;
148 float cropBoxUpperRightX = 0;
149 float cropBoxUpperRightY = 0;
150 boolean showTime = false;
151 boolean textOnly = false;
152 boolean imagesOnly = false;
153
154 try
155 {
156 dpi = Toolkit.getDefaultToolkit().getScreenResolution();
157 }
158 catch( HeadlessException e )
159 {
160 dpi = 96;
161 }
162 for( int i = 0; i < args.length; i++ )
163 {
164 if( args[i].equals( PASSWORD ) )
165 {
166 i++;
167 if( i >= args.length )
168 {
169 usage();
170 }
171 password = args[i];
172 }
173 else if( args[i].equals( ENCODING ) )
174 {
175 i++;
176 if( i >= args.length )
177 {
178 usage();
179 }
180 encoding = args[i];
181 }
182 else if( args[i].equals( START_PAGE ) )
183 {
184 i++;
185 if( i >= args.length )
186 {
187 usage();
188 }
189 startPage = Integer.parseInt( args[i] );
190 }
191 else if( args[i].equals( END_PAGE ) )
192 {
193 i++;
194 if( i >= args.length )
195 {
196 usage();
197 }
198 endPage = Integer.parseInt( args[i] );
199 }
200 else if( args[i].equals( PAGE ) )
201 {
202 i++;
203 if( i >= args.length )
204 {
205 usage();
206 }
207 startPage = Integer.parseInt( args[i] );
208 endPage = Integer.parseInt( args[i] );
209 }
210 else if( args[i].equals(IMAGE_TYPE) || args[i].equals(FORMAT) )
211 {
212 i++;
213 imageFormat = args[i];
214 }
215 else if( args[i].equals( OUTPUT_PREFIX ) || args[i].equals( PREFIX ) )
216 {
217 i++;
218 outputPrefix = args[i];
219 }
220 else if( args[i].equals( COLOR ) )
221 {
222 i++;
223 color = args[i];
224 }
225 else if( args[i].equals( RESOLUTION ) || args[i].equals( DPI ) )
226 {
227 i++;
228 dpi = Integer.parseInt(args[i]);
229 }
230 else if( args[i].equals( CROPBOX ) )
231 {
232 i++;
233 cropBoxLowerLeftX = Float.valueOf(args[i]);
234 i++;
235 cropBoxLowerLeftY = Float.valueOf(args[i]);
236 i++;
237 cropBoxUpperRightX = Float.valueOf(args[i]);
238 i++;
239 cropBoxUpperRightY = Float.valueOf(args[i]);
240 }
241 else if( args[i].equals( TEXT_ONLY ) )
242 {
243 textOnly = true;
244 }
245 else if( args[i].equals( IMAGES_ONLY ) )
246 {
247 imagesOnly = true;
248 }
249 else if( args[i].equals( TIME ) )
250 {
251 showTime = true;
252 }
253 else
254 {
255 if( pdfFile == null )
256 {
257 pdfFile = args[i];
258 }
259 }
260 }
261 if( pdfFile == null )
262 {
263 usage();
264 }
265 else
266 {
267 if(outputPrefix == null)
268 {
269 outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));
270 }
271
272 PDDocument document = null;
273 try
274 {
275 boolean extractingTextAllowed = true;
276 //String outputFile = null;
277
278 /*startProcessing("Loading PDF "+pdfFile);
279 if( outputFile == null && pdfFile.length() >4 )
280 {
281 outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
282 }*/
283
284 document = PDDocument.load(new File(pdfFile), password);
285
286 AccessPermission ap = document.getCurrentAccessPermission();
287 if( ! ap.canExtractContent() )
288 {
289 //throw new IOException( "You do not have permission to extract text" );
290 System.err.println( "*** You do not have permission to extract text" ); // still extract the pages as images then?
291 extractingTextAllowed = false;
292 }
293 //stopProcessing("Time for loading: ", startTime);
294
295 // don't extract to HTML in this class, just extract to txt
296 PDFTextStripper stripper = new PDFTextStripper();
297 //stripper.setSortByPosition( sort );
298 //stripper.setShouldSeparateByBeads( separateBeads );
299 stripper.setShouldSeparateByBeads( true );
300
301
302 ImageType imageType = null;
303 if ("bilevel".equalsIgnoreCase(color))
304 {
305 imageType = ImageType.BINARY;
306 }
307 else if ("gray".equalsIgnoreCase(color))
308 {
309 imageType = ImageType.GRAY;
310 }
311 else if ("rgb".equalsIgnoreCase(color))
312 {
313 imageType = ImageType.RGB;
314 }
315 else if ("rgba".equalsIgnoreCase(color))
316 {
317 imageType = ImageType.ARGB;
318 }
319
320 if (imageType == null)
321 {
322 System.err.println( "Error: Invalid color." );
323 System.exit( 2 );
324 }
325
326 //if a CropBox has been specified, update the CropBox:
327 //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
328 if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0
329 || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )
330 {
331 changeCropBox(document,
332 cropBoxLowerLeftX, cropBoxLowerLeftY,
333 cropBoxUpperRightX, cropBoxUpperRightY);
334 }
335
336 long startTime = System.nanoTime();
337
338 // render the pages
339 boolean success = true;
340 endPage = Math.min(endPage, document.getNumberOfPages());
341 PDFRenderer renderer = new PDFRenderer(document);
342 for (int i = startPage - 1; i < endPage; i++)
343 {
344 int lastSlash = outputPrefix.lastIndexOf(File.separator);
345 outputPrefix = outputPrefix.substring(0, lastSlash+1); // include the folder (/) but not the filename prefix
346 String fileName = outputPrefix + (i + 1) + ".";
347
348 if(!textOnly) {
349 // turn page into image
350 BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
351 success &= ImageIOUtil.writeImage(image, fileName+imageFormat, dpi);
352 }
353
354 // image version of page done, now extract text from current page
355 if(!imagesOnly && extractingTextAllowed) {
356 Writer output = null;
357 try {
358 output = new OutputStreamWriter( new FileOutputStream( fileName+"txt" ), encoding );
359 stripper.setStartPage( i+1 );
360 stripper.setEndPage( i+1 );
361
362 //if (debug)
363 //{
364 System.err.println("Writing to "+fileName);
365 //}
366
367 // Extract text for main document, the specified pages
368 stripper.writeText( document, output );
369 } catch (Exception ex) {
370 System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
371 } finally {
372 IOUtils.closeQuietly(output);
373 }
374 }
375 }
376
377 // GS NOTE: We just extracted text for (each page of) the main document, but
378 // we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java
379
380 // performance stats
381 long endTime = System.nanoTime();
382 long duration = endTime - startTime;
383 int count = 1 + endPage - startPage;
384 if (showTime)
385 {
386 System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
387 duration / 1000000);
388 }
389
390 if (!success)
391 {
392 System.err.println( "Error: no writer found for image format '"
393 + imageFormat + "'" );
394 System.exit(1);
395 }
396 }
397 finally
398 {
399 if( document != null )
400 {
401 document.close();
402 }
403 }
404 }
405 }
406
407 /**
408 * This will print the usage requirements and exit.
409 */
410 private static void usage()
411 {
412 String message = "Usage: java -jar pdfbox-app-x.y.z.jar PDFBoxToImagesAndText [options] <inputfile>\n"
413 + "\nOptions:\n"
414 + " -password <password> : Password to decrypt document\n"
415 + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
416 + " -format <string> : Image format: " + getImageFormats() + "\n"
417 + " -prefix <string> : Filename prefix for image files\n"
418 + " -page <number> : The only page to extract (1-based)\n"
419 + " -startPage <int> : The first page to start extraction (1-based)\n"
420 + " -endPage <int> : The last page to extract(inclusive)\n"
421 + " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"
422 + " -dpi <int> : The DPI of the output image\n"
423 + " -cropbox <int> <int> <int> <int> : The page area to export\n"
424 + " -time : Prints timing information to stdout\n"
425 + " <inputfile> : The PDF document to use\n";
426
427 System.err.println(message);
428 System.exit( 1 );
429 }
430
431 private static String getImageFormats()
432 {
433 StringBuilder retval = new StringBuilder();
434 String[] formats = ImageIO.getReaderFormatNames();
435 for( int i = 0; i < formats.length; i++ )
436 {
437 if (formats[i].equalsIgnoreCase(formats[i]))
438 {
439 retval.append( formats[i] );
440 if( i + 1 < formats.length )
441 {
442 retval.append( ", " );
443 }
444 }
445 }
446 return retval.toString();
447 }
448
449 private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
450 {
451 for (PDPage page : document.getPages())
452 {
453 System.out.println("resizing page");
454 PDRectangle rectangle = new PDRectangle();
455 rectangle.setLowerLeftX(a);
456 rectangle.setLowerLeftY(b);
457 rectangle.setUpperRightX(c);
458 rectangle.setUpperRightY(d);
459 page.setCropBox(rectangle);
460
461 }
462 }
463}
Note: See TracBrowser for help on using the repository browser.