source: gs2-extensions/pdf-box/trunk/java/src/org/greenstone/pdfbox/PDFBoxToImagesAndText.java@ 32197

Last change on this file since 32197 was 32197, checked in by ak19, 6 years ago

Updates to the recent commit's modifications to do with pdfbox: new class has been renamed from GS_PDFToImagesAndText.java to org/greenstone/pdfbox/PDFBoxToImagesAndText.java and uses a GS package. This class file is no longer included in pdfbox-app.jar, but is just compiled against that. Added Apache v 2.0 licensing related files. PDFBoxConverter.pm now refers to the newly named Java class with the new org.greenstone.pdfbox package name. Updated the Readme to add instructions to do with compiling the new java file and its new folder/package structure, and information related to the Apache license. There's also the new java/build subfolder containing the precompiled class file (and Java pkg structure) for the new class. This new build folder with the new custom class, and the modified PDFBoxConverter.pm and the modified pdfbox-app.jar (without the custom class) are modifications to the pdfbox tarball/zip files too.

File size: 15.7 KB
Line 
1/**********************************************************************
2 *
3 * PDFBoxToImagesAndText.java based on Apache PDFBox®'s PDFToImage.java
4 * with further code spliced in from its ExtractText.java with some
5 * minor modifications.
6 *
7 * The code in this file is therefore under the same Apache License
8 * version 2.0 as Apache's PDFBox.
9 *
10 * Copyright 2018 The New Zealand Digital Library Project
11 *
12 * A component of the Greenstone digital library software
13 * from the New Zealand Digital Library Project at the
14 * University of Waikato, New Zealand.
15 *
16 * This program is free software; you can redistribute it and/or modify
17 * it under the terms of the Apache License version 2.0.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * Apache License version 2.0 for more details.
23 *
24 * You should have received a copy of the Apache License version 2.0
25 * along with this file; if not, refer to
26 * https://www.apache.org/licenses/LICENSE-2.0.
27 *
28 * The following comment is from the original file,
29 * PDFBox's PDFToImage.java
30 *
31 *********************************************************************/
32/*
33 * Licensed to the Apache Software Foundation (ASF) under one or more
34 * contributor license agreements. See the NOTICE file distributed with
35 * this work for additional information regarding copyright ownership.
36 * The ASF licenses this file to You under the Apache License, Version 2.0
37 * (the "License"); you may not use this file except in compliance with
38 * the License. You may obtain a copy of the License at
39 *
40 * http://www.apache.org/licenses/LICENSE-2.0
41 *
42 * Unless required by applicable law or agreed to in writing, software
43 * distributed under the License is distributed on an "AS IS" BASIS,
44 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
45 * See the License for the specific language governing permissions and
46 * limitations under the License.
47 */
48package org.greenstone.pdfbox;
49
50import java.awt.HeadlessException;
51import java.awt.Toolkit;
52import java.awt.image.BufferedImage;
53import java.io.File;
54import java.io.FileOutputStream;
55import java.io.IOException;
56import java.io.OutputStreamWriter;
57import java.io.Writer;
58import org.apache.pdfbox.io.IOUtils;
59
60import javax.imageio.ImageIO;
61
62import org.apache.pdfbox.pdmodel.PDDocument;
63import org.apache.pdfbox.pdmodel.PDPage;
64import org.apache.pdfbox.pdmodel.common.PDRectangle;
65import org.apache.pdfbox.rendering.ImageType;
66import org.apache.pdfbox.rendering.PDFRenderer;
67import org.apache.pdfbox.tools.imageio.ImageIOUtil;
68
69import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
70import org.apache.pdfbox.text.PDFTextStripper;
71
72/**
73 * This class is based on PDFToImage.java which converts
74 * the pages of a PDF document to images.
75 * This class should convert the pages to images and
76 * extract the text of each page. That part of the code
77 * to be taken from ExtractText.java
78 *
79 * Built on Apache PDFBox's PDFToImage.java with minor modifications.
80 * ak19
81 */
82public final class PDFBoxToImagesAndText
83{
84 private static final String PASSWORD = "-password";
85 private static final String ENCODING = "-encoding";
86 private static final String START_PAGE = "-startPage";
87 private static final String END_PAGE = "-endPage";
88 private static final String PAGE = "-page";
89 private static final String IMAGE_TYPE = "-imageType";
90 private static final String FORMAT = "-format";
91 private static final String OUTPUT_PREFIX = "-outputPrefix";
92 private static final String PREFIX = "-prefix";
93 private static final String COLOR = "-color";
94 private static final String RESOLUTION = "-resolution";
95 private static final String DPI = "-dpi";
96 private static final String CROPBOX = "-cropbox";
97 private static final String TIME = "-time";
98
99 private static final String STD_ENCODING = "UTF-8";
100
101 /**
102 * private constructor.
103 */
104 private PDFBoxToImagesAndText()
105 {
106 //static class
107 }
108
109 /**
110 * Infamous main method.
111 *
112 * @param args Command line arguments, should be one and a reference to a file.
113 *
114 * @throws IOException If there is an error parsing the document.
115 */
116 public static void main( String[] args ) throws IOException
117 {
118 try
119 {
120 // force KCMS (faster than LCMS) if available
121 Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
122 System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
123 }
124 catch (ClassNotFoundException e)
125 {
126 // do nothing
127 }
128
129 // suppress the Dock icon on OS X
130 System.setProperty("apple.awt.UIElement", "true");
131
132 String password = "";
133 String encoding = STD_ENCODING;
134 String pdfFile = null;
135 String outputPrefix = null;
136 String imageFormat = "jpg";
137 int startPage = 1;
138 int endPage = Integer.MAX_VALUE;
139 String color = "rgb";
140 int dpi;
141 float cropBoxLowerLeftX = 0;
142 float cropBoxLowerLeftY = 0;
143 float cropBoxUpperRightX = 0;
144 float cropBoxUpperRightY = 0;
145 boolean showTime = false;
146 try
147 {
148 dpi = Toolkit.getDefaultToolkit().getScreenResolution();
149 }
150 catch( HeadlessException e )
151 {
152 dpi = 96;
153 }
154 for( int i = 0; i < args.length; i++ )
155 {
156 if( args[i].equals( PASSWORD ) )
157 {
158 i++;
159 if( i >= args.length )
160 {
161 usage();
162 }
163 password = args[i];
164 }
165 else if( args[i].equals( ENCODING ) )
166 {
167 i++;
168 if( i >= args.length )
169 {
170 usage();
171 }
172 encoding = args[i];
173 }
174 else if( args[i].equals( START_PAGE ) )
175 {
176 i++;
177 if( i >= args.length )
178 {
179 usage();
180 }
181 startPage = Integer.parseInt( args[i] );
182 }
183 else if( args[i].equals( END_PAGE ) )
184 {
185 i++;
186 if( i >= args.length )
187 {
188 usage();
189 }
190 endPage = Integer.parseInt( args[i] );
191 }
192 else if( args[i].equals( PAGE ) )
193 {
194 i++;
195 if( i >= args.length )
196 {
197 usage();
198 }
199 startPage = Integer.parseInt( args[i] );
200 endPage = Integer.parseInt( args[i] );
201 }
202 else if( args[i].equals(IMAGE_TYPE) || args[i].equals(FORMAT) )
203 {
204 i++;
205 imageFormat = args[i];
206 }
207 else if( args[i].equals( OUTPUT_PREFIX ) || args[i].equals( PREFIX ) )
208 {
209 i++;
210 outputPrefix = args[i];
211 }
212 else if( args[i].equals( COLOR ) )
213 {
214 i++;
215 color = args[i];
216 }
217 else if( args[i].equals( RESOLUTION ) || args[i].equals( DPI ) )
218 {
219 i++;
220 dpi = Integer.parseInt(args[i]);
221 }
222 else if( args[i].equals( CROPBOX ) )
223 {
224 i++;
225 cropBoxLowerLeftX = Float.valueOf(args[i]);
226 i++;
227 cropBoxLowerLeftY = Float.valueOf(args[i]);
228 i++;
229 cropBoxUpperRightX = Float.valueOf(args[i]);
230 i++;
231 cropBoxUpperRightY = Float.valueOf(args[i]);
232 }
233 else if( args[i].equals( TIME ) )
234 {
235 showTime = true;
236 }
237 else
238 {
239 if( pdfFile == null )
240 {
241 pdfFile = args[i];
242 }
243 }
244 }
245 if( pdfFile == null )
246 {
247 usage();
248 }
249 else
250 {
251 if(outputPrefix == null)
252 {
253 outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));
254 }
255
256 PDDocument document = null;
257 try
258 {
259 boolean extractingTextAllowed = true;
260 //String outputFile = null;
261
262 /*startProcessing("Loading PDF "+pdfFile);
263 if( outputFile == null && pdfFile.length() >4 )
264 {
265 outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
266 }*/
267
268 document = PDDocument.load(new File(pdfFile), password);
269
270 AccessPermission ap = document.getCurrentAccessPermission();
271 if( ! ap.canExtractContent() )
272 {
273 //throw new IOException( "You do not have permission to extract text" );
274 System.err.println( "*** You do not have permission to extract text" ); // still extract the pages as images then?
275 extractingTextAllowed = false;
276 }
277 //stopProcessing("Time for loading: ", startTime);
278
279 // don't extract to HTML in this class, just extract to txt
280 PDFTextStripper stripper = new PDFTextStripper();
281 //stripper.setSortByPosition( sort );
282 //stripper.setShouldSeparateByBeads( separateBeads );
283 stripper.setShouldSeparateByBeads( true );
284
285
286 ImageType imageType = null;
287 if ("bilevel".equalsIgnoreCase(color))
288 {
289 imageType = ImageType.BINARY;
290 }
291 else if ("gray".equalsIgnoreCase(color))
292 {
293 imageType = ImageType.GRAY;
294 }
295 else if ("rgb".equalsIgnoreCase(color))
296 {
297 imageType = ImageType.RGB;
298 }
299 else if ("rgba".equalsIgnoreCase(color))
300 {
301 imageType = ImageType.ARGB;
302 }
303
304 if (imageType == null)
305 {
306 System.err.println( "Error: Invalid color." );
307 System.exit( 2 );
308 }
309
310 //if a CropBox has been specified, update the CropBox:
311 //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
312 if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0
313 || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )
314 {
315 changeCropBox(document,
316 cropBoxLowerLeftX, cropBoxLowerLeftY,
317 cropBoxUpperRightX, cropBoxUpperRightY);
318 }
319
320 long startTime = System.nanoTime();
321
322 // render the pages
323 boolean success = true;
324 endPage = Math.min(endPage, document.getNumberOfPages());
325 PDFRenderer renderer = new PDFRenderer(document);
326 for (int i = startPage - 1; i < endPage; i++)
327 {
328 // turn page into image
329 BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
330 int lastSlash = outputPrefix.lastIndexOf(File.separator);
331 outputPrefix = outputPrefix.substring(0, lastSlash+1); // include the folder (/) but not the filename prefix
332 String fileName = outputPrefix + (i + 1) + ".";
333 success &= ImageIOUtil.writeImage(image, fileName+imageFormat, dpi);
334
335
336 // image version of page done, now extract text from current page
337 if(extractingTextAllowed) {
338 Writer output = null;
339 try {
340 output = new OutputStreamWriter( new FileOutputStream( fileName+"txt" ), encoding );
341 stripper.setStartPage( i+1 );
342 stripper.setEndPage( i+1 );
343
344 //if (debug)
345 //{
346 System.err.println("Writing to "+fileName);
347 //}
348
349 // Extract text for main document, the specified pages
350 stripper.writeText( document, output );
351 } catch (Exception ex) {
352 System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
353 } finally {
354 IOUtils.closeQuietly(output);
355 }
356 }
357 }
358
359 // GS NOTE: We just extracted text for (each page of) the main document, but
360 // we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java
361
362 // performance stats
363 long endTime = System.nanoTime();
364 long duration = endTime - startTime;
365 int count = 1 + endPage - startPage;
366 if (showTime)
367 {
368 System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
369 duration / 1000000);
370 }
371
372 if (!success)
373 {
374 System.err.println( "Error: no writer found for image format '"
375 + imageFormat + "'" );
376 System.exit(1);
377 }
378 }
379 finally
380 {
381 if( document != null )
382 {
383 document.close();
384 }
385 }
386 }
387 }
388
389 /**
390 * This will print the usage requirements and exit.
391 */
392 private static void usage()
393 {
394 String message = "Usage: java -jar pdfbox-app-x.y.z.jar PDFBoxToImagesAndText [options] <inputfile>\n"
395 + "\nOptions:\n"
396 + " -password <password> : Password to decrypt document\n"
397 + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
398 + " -format <string> : Image format: " + getImageFormats() + "\n"
399 + " -prefix <string> : Filename prefix for image files\n"
400 + " -page <number> : The only page to extract (1-based)\n"
401 + " -startPage <int> : The first page to start extraction (1-based)\n"
402 + " -endPage <int> : The last page to extract(inclusive)\n"
403 + " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"
404 + " -dpi <int> : The DPI of the output image\n"
405 + " -cropbox <int> <int> <int> <int> : The page area to export\n"
406 + " -time : Prints timing information to stdout\n"
407 + " <inputfile> : The PDF document to use\n";
408
409 System.err.println(message);
410 System.exit( 1 );
411 }
412
413 private static String getImageFormats()
414 {
415 StringBuilder retval = new StringBuilder();
416 String[] formats = ImageIO.getReaderFormatNames();
417 for( int i = 0; i < formats.length; i++ )
418 {
419 if (formats[i].equalsIgnoreCase(formats[i]))
420 {
421 retval.append( formats[i] );
422 if( i + 1 < formats.length )
423 {
424 retval.append( ", " );
425 }
426 }
427 }
428 return retval.toString();
429 }
430
431 private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
432 {
433 for (PDPage page : document.getPages())
434 {
435 System.out.println("resizing page");
436 PDRectangle rectangle = new PDRectangle();
437 rectangle.setLowerLeftX(a);
438 rectangle.setLowerLeftY(b);
439 rectangle.setUpperRightX(c);
440 rectangle.setUpperRightY(d);
441 page.setCropBox(rectangle);
442
443 }
444 }
445}
Note: See TracBrowser for help on using the repository browser.