[32193] | 1 | /*
|
---|
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more
|
---|
| 3 | * contributor license agreements. See the NOTICE file distributed with
|
---|
| 4 | * this work for additional information regarding copyright ownership.
|
---|
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0
|
---|
| 6 | * (the "License"); you may not use this file except in compliance with
|
---|
| 7 | * the License. You may obtain a copy of the License at
|
---|
| 8 | *
|
---|
| 9 | * http://www.apache.org/licenses/LICENSE-2.0
|
---|
| 10 | *
|
---|
| 11 | * Unless required by applicable law or agreed to in writing, software
|
---|
| 12 | * distributed under the License is distributed on an "AS IS" BASIS,
|
---|
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
| 14 | * See the License for the specific language governing permissions and
|
---|
| 15 | * limitations under the License.
|
---|
| 16 | */
|
---|
| 17 | package org.apache.pdfbox.tools;
|
---|
| 18 |
|
---|
| 19 | import java.awt.HeadlessException;
|
---|
| 20 | import java.awt.Toolkit;
|
---|
| 21 | import java.awt.image.BufferedImage;
|
---|
| 22 | import java.io.File;
|
---|
| 23 | import java.io.FileOutputStream;
|
---|
| 24 | import java.io.IOException;
|
---|
| 25 | import java.io.OutputStreamWriter;
|
---|
| 26 | import java.io.Writer;
|
---|
| 27 | import org.apache.pdfbox.io.IOUtils;
|
---|
| 28 |
|
---|
| 29 | import javax.imageio.ImageIO;
|
---|
| 30 |
|
---|
| 31 | import org.apache.pdfbox.pdmodel.PDDocument;
|
---|
| 32 | import org.apache.pdfbox.pdmodel.PDPage;
|
---|
| 33 | import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
---|
| 34 | import org.apache.pdfbox.rendering.ImageType;
|
---|
| 35 | import org.apache.pdfbox.rendering.PDFRenderer;
|
---|
| 36 | import org.apache.pdfbox.tools.imageio.ImageIOUtil;
|
---|
| 37 |
|
---|
| 38 | import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
|
---|
| 39 | import org.apache.pdfbox.text.PDFTextStripper;
|
---|
| 40 |
|
---|
| 41 | /**
|
---|
| 42 | * This class is based on PDFToImage.java which converts
|
---|
| 43 | * the pages of a PDF document to images.
|
---|
| 44 | * This class should convert the pages to images and
|
---|
| 45 | * extract the text of each page. That part of the code
|
---|
| 46 | * to be taken from ExtractText.java
|
---|
| 47 | *
|
---|
| 48 | * Built on Apache PDFBox's PDFToImage.java with minor modifications.
|
---|
| 49 | * ak19
|
---|
| 50 | */
|
---|
| 51 | public final class GS_PDFToImagesAndText
|
---|
| 52 | {
|
---|
| 53 | private static final String PASSWORD = "-password";
|
---|
| 54 | private static final String ENCODING = "-encoding";
|
---|
| 55 | private static final String START_PAGE = "-startPage";
|
---|
| 56 | private static final String END_PAGE = "-endPage";
|
---|
| 57 | private static final String PAGE = "-page";
|
---|
| 58 | private static final String IMAGE_TYPE = "-imageType";
|
---|
| 59 | private static final String FORMAT = "-format";
|
---|
| 60 | private static final String OUTPUT_PREFIX = "-outputPrefix";
|
---|
| 61 | private static final String PREFIX = "-prefix";
|
---|
| 62 | private static final String COLOR = "-color";
|
---|
| 63 | private static final String RESOLUTION = "-resolution";
|
---|
| 64 | private static final String DPI = "-dpi";
|
---|
| 65 | private static final String CROPBOX = "-cropbox";
|
---|
| 66 | private static final String TIME = "-time";
|
---|
| 67 |
|
---|
| 68 | private static final String STD_ENCODING = "UTF-8";
|
---|
| 69 |
|
---|
| 70 | /**
|
---|
| 71 | * private constructor.
|
---|
| 72 | */
|
---|
| 73 | private GS_PDFToImagesAndText()
|
---|
| 74 | {
|
---|
| 75 | //static class
|
---|
| 76 | }
|
---|
| 77 |
|
---|
| 78 | /**
|
---|
| 79 | * Infamous main method.
|
---|
| 80 | *
|
---|
| 81 | * @param args Command line arguments, should be one and a reference to a file.
|
---|
| 82 | *
|
---|
| 83 | * @throws IOException If there is an error parsing the document.
|
---|
| 84 | */
|
---|
| 85 | public static void main( String[] args ) throws IOException
|
---|
| 86 | {
|
---|
| 87 | try
|
---|
| 88 | {
|
---|
| 89 | // force KCMS (faster than LCMS) if available
|
---|
| 90 | Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
|
---|
| 91 | System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
|
---|
| 92 | }
|
---|
| 93 | catch (ClassNotFoundException e)
|
---|
| 94 | {
|
---|
| 95 | // do nothing
|
---|
| 96 | }
|
---|
| 97 |
|
---|
| 98 | // suppress the Dock icon on OS X
|
---|
| 99 | System.setProperty("apple.awt.UIElement", "true");
|
---|
| 100 |
|
---|
| 101 | String password = "";
|
---|
| 102 | String encoding = STD_ENCODING;
|
---|
| 103 | String pdfFile = null;
|
---|
| 104 | String outputPrefix = null;
|
---|
| 105 | String imageFormat = "jpg";
|
---|
| 106 | int startPage = 1;
|
---|
| 107 | int endPage = Integer.MAX_VALUE;
|
---|
| 108 | String color = "rgb";
|
---|
| 109 | int dpi;
|
---|
| 110 | float cropBoxLowerLeftX = 0;
|
---|
| 111 | float cropBoxLowerLeftY = 0;
|
---|
| 112 | float cropBoxUpperRightX = 0;
|
---|
| 113 | float cropBoxUpperRightY = 0;
|
---|
| 114 | boolean showTime = false;
|
---|
| 115 | try
|
---|
| 116 | {
|
---|
| 117 | dpi = Toolkit.getDefaultToolkit().getScreenResolution();
|
---|
| 118 | }
|
---|
| 119 | catch( HeadlessException e )
|
---|
| 120 | {
|
---|
| 121 | dpi = 96;
|
---|
| 122 | }
|
---|
| 123 | for( int i = 0; i < args.length; i++ )
|
---|
| 124 | {
|
---|
| 125 | if( args[i].equals( PASSWORD ) )
|
---|
| 126 | {
|
---|
| 127 | i++;
|
---|
| 128 | if( i >= args.length )
|
---|
| 129 | {
|
---|
| 130 | usage();
|
---|
| 131 | }
|
---|
| 132 | password = args[i];
|
---|
| 133 | }
|
---|
| 134 | else if( args[i].equals( ENCODING ) )
|
---|
| 135 | {
|
---|
| 136 | i++;
|
---|
| 137 | if( i >= args.length )
|
---|
| 138 | {
|
---|
| 139 | usage();
|
---|
| 140 | }
|
---|
| 141 | encoding = args[i];
|
---|
| 142 | }
|
---|
| 143 | else if( args[i].equals( START_PAGE ) )
|
---|
| 144 | {
|
---|
| 145 | i++;
|
---|
| 146 | if( i >= args.length )
|
---|
| 147 | {
|
---|
| 148 | usage();
|
---|
| 149 | }
|
---|
| 150 | startPage = Integer.parseInt( args[i] );
|
---|
| 151 | }
|
---|
| 152 | else if( args[i].equals( END_PAGE ) )
|
---|
| 153 | {
|
---|
| 154 | i++;
|
---|
| 155 | if( i >= args.length )
|
---|
| 156 | {
|
---|
| 157 | usage();
|
---|
| 158 | }
|
---|
| 159 | endPage = Integer.parseInt( args[i] );
|
---|
| 160 | }
|
---|
| 161 | else if( args[i].equals( PAGE ) )
|
---|
| 162 | {
|
---|
| 163 | i++;
|
---|
| 164 | if( i >= args.length )
|
---|
| 165 | {
|
---|
| 166 | usage();
|
---|
| 167 | }
|
---|
| 168 | startPage = Integer.parseInt( args[i] );
|
---|
| 169 | endPage = Integer.parseInt( args[i] );
|
---|
| 170 | }
|
---|
| 171 | else if( args[i].equals(IMAGE_TYPE) || args[i].equals(FORMAT) )
|
---|
| 172 | {
|
---|
| 173 | i++;
|
---|
| 174 | imageFormat = args[i];
|
---|
| 175 | }
|
---|
| 176 | else if( args[i].equals( OUTPUT_PREFIX ) || args[i].equals( PREFIX ) )
|
---|
| 177 | {
|
---|
| 178 | i++;
|
---|
| 179 | outputPrefix = args[i];
|
---|
| 180 | }
|
---|
| 181 | else if( args[i].equals( COLOR ) )
|
---|
| 182 | {
|
---|
| 183 | i++;
|
---|
| 184 | color = args[i];
|
---|
| 185 | }
|
---|
| 186 | else if( args[i].equals( RESOLUTION ) || args[i].equals( DPI ) )
|
---|
| 187 | {
|
---|
| 188 | i++;
|
---|
| 189 | dpi = Integer.parseInt(args[i]);
|
---|
| 190 | }
|
---|
| 191 | else if( args[i].equals( CROPBOX ) )
|
---|
| 192 | {
|
---|
| 193 | i++;
|
---|
| 194 | cropBoxLowerLeftX = Float.valueOf(args[i]);
|
---|
| 195 | i++;
|
---|
| 196 | cropBoxLowerLeftY = Float.valueOf(args[i]);
|
---|
| 197 | i++;
|
---|
| 198 | cropBoxUpperRightX = Float.valueOf(args[i]);
|
---|
| 199 | i++;
|
---|
| 200 | cropBoxUpperRightY = Float.valueOf(args[i]);
|
---|
| 201 | }
|
---|
| 202 | else if( args[i].equals( TIME ) )
|
---|
| 203 | {
|
---|
| 204 | showTime = true;
|
---|
| 205 | }
|
---|
| 206 | else
|
---|
| 207 | {
|
---|
| 208 | if( pdfFile == null )
|
---|
| 209 | {
|
---|
| 210 | pdfFile = args[i];
|
---|
| 211 | }
|
---|
| 212 | }
|
---|
| 213 | }
|
---|
| 214 | if( pdfFile == null )
|
---|
| 215 | {
|
---|
| 216 | usage();
|
---|
| 217 | }
|
---|
| 218 | else
|
---|
| 219 | {
|
---|
| 220 | if(outputPrefix == null)
|
---|
| 221 | {
|
---|
| 222 | outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));
|
---|
| 223 | }
|
---|
| 224 |
|
---|
| 225 | PDDocument document = null;
|
---|
| 226 | try
|
---|
| 227 | {
|
---|
| 228 | boolean extractingTextAllowed = true;
|
---|
| 229 | //String outputFile = null;
|
---|
| 230 |
|
---|
| 231 | /*startProcessing("Loading PDF "+pdfFile);
|
---|
| 232 | if( outputFile == null && pdfFile.length() >4 )
|
---|
| 233 | {
|
---|
| 234 | outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
|
---|
| 235 | }*/
|
---|
| 236 |
|
---|
| 237 | document = PDDocument.load(new File(pdfFile), password);
|
---|
| 238 |
|
---|
| 239 | AccessPermission ap = document.getCurrentAccessPermission();
|
---|
| 240 | if( ! ap.canExtractContent() )
|
---|
| 241 | {
|
---|
| 242 | //throw new IOException( "You do not have permission to extract text" );
|
---|
| 243 | System.err.println( "*** You do not have permission to extract text" ); // still extract the pages as images then?
|
---|
| 244 | extractingTextAllowed = false;
|
---|
| 245 | }
|
---|
| 246 | //stopProcessing("Time for loading: ", startTime);
|
---|
| 247 |
|
---|
| 248 | // don't extract to HTML in this class, just extract to txt
|
---|
| 249 | PDFTextStripper stripper = new PDFTextStripper();
|
---|
| 250 | //stripper.setSortByPosition( sort );
|
---|
| 251 | //stripper.setShouldSeparateByBeads( separateBeads );
|
---|
| 252 | stripper.setShouldSeparateByBeads( true );
|
---|
| 253 |
|
---|
| 254 |
|
---|
| 255 | ImageType imageType = null;
|
---|
| 256 | if ("bilevel".equalsIgnoreCase(color))
|
---|
| 257 | {
|
---|
| 258 | imageType = ImageType.BINARY;
|
---|
| 259 | }
|
---|
| 260 | else if ("gray".equalsIgnoreCase(color))
|
---|
| 261 | {
|
---|
| 262 | imageType = ImageType.GRAY;
|
---|
| 263 | }
|
---|
| 264 | else if ("rgb".equalsIgnoreCase(color))
|
---|
| 265 | {
|
---|
| 266 | imageType = ImageType.RGB;
|
---|
| 267 | }
|
---|
| 268 | else if ("rgba".equalsIgnoreCase(color))
|
---|
| 269 | {
|
---|
| 270 | imageType = ImageType.ARGB;
|
---|
| 271 | }
|
---|
| 272 |
|
---|
| 273 | if (imageType == null)
|
---|
| 274 | {
|
---|
| 275 | System.err.println( "Error: Invalid color." );
|
---|
| 276 | System.exit( 2 );
|
---|
| 277 | }
|
---|
| 278 |
|
---|
| 279 | //if a CropBox has been specified, update the CropBox:
|
---|
| 280 | //changeCropBoxes(PDDocument document,float a, float b, float c,float d)
|
---|
| 281 | if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0
|
---|
| 282 | || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )
|
---|
| 283 | {
|
---|
| 284 | changeCropBox(document,
|
---|
| 285 | cropBoxLowerLeftX, cropBoxLowerLeftY,
|
---|
| 286 | cropBoxUpperRightX, cropBoxUpperRightY);
|
---|
| 287 | }
|
---|
| 288 |
|
---|
| 289 | long startTime = System.nanoTime();
|
---|
| 290 |
|
---|
| 291 | // render the pages
|
---|
| 292 | boolean success = true;
|
---|
| 293 | endPage = Math.min(endPage, document.getNumberOfPages());
|
---|
| 294 | PDFRenderer renderer = new PDFRenderer(document);
|
---|
| 295 | for (int i = startPage - 1; i < endPage; i++)
|
---|
| 296 | {
|
---|
| 297 | // turn page into image
|
---|
| 298 | BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
|
---|
| 299 | int lastSlash = outputPrefix.lastIndexOf(File.separator);
|
---|
| 300 | outputPrefix = outputPrefix.substring(0, lastSlash+1); // include the folder (/) but not the filename prefix
|
---|
| 301 | String fileName = outputPrefix + (i + 1) + ".";
|
---|
| 302 | success &= ImageIOUtil.writeImage(image, fileName+imageFormat, dpi);
|
---|
| 303 |
|
---|
| 304 |
|
---|
| 305 | // image version of page done, now extract text from current page
|
---|
| 306 | if(extractingTextAllowed) {
|
---|
| 307 | Writer output = null;
|
---|
| 308 | try {
|
---|
| 309 | output = new OutputStreamWriter( new FileOutputStream( fileName+"txt" ), encoding );
|
---|
| 310 | stripper.setStartPage( i+1 );
|
---|
| 311 | stripper.setEndPage( i+1 );
|
---|
| 312 |
|
---|
| 313 | //if (debug)
|
---|
| 314 | //{
|
---|
| 315 | System.err.println("Writing to "+fileName);
|
---|
| 316 | //}
|
---|
| 317 |
|
---|
| 318 | // Extract text for main document, the specified pages
|
---|
| 319 | stripper.writeText( document, output );
|
---|
| 320 | } catch (Exception ex) {
|
---|
| 321 | System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
|
---|
| 322 | } finally {
|
---|
| 323 | IOUtils.closeQuietly(output);
|
---|
| 324 | }
|
---|
| 325 | }
|
---|
| 326 | }
|
---|
| 327 |
|
---|
| 328 | // GS NOTE: We just extracted text for (each page of) the main document, but
|
---|
| 329 | // we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java
|
---|
| 330 |
|
---|
| 331 | // performance stats
|
---|
| 332 | long endTime = System.nanoTime();
|
---|
| 333 | long duration = endTime - startTime;
|
---|
| 334 | int count = 1 + endPage - startPage;
|
---|
| 335 | if (showTime)
|
---|
| 336 | {
|
---|
| 337 | System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
|
---|
| 338 | duration / 1000000);
|
---|
| 339 | }
|
---|
| 340 |
|
---|
| 341 | if (!success)
|
---|
| 342 | {
|
---|
| 343 | System.err.println( "Error: no writer found for image format '"
|
---|
| 344 | + imageFormat + "'" );
|
---|
| 345 | System.exit(1);
|
---|
| 346 | }
|
---|
| 347 | }
|
---|
| 348 | finally
|
---|
| 349 | {
|
---|
| 350 | if( document != null )
|
---|
| 351 | {
|
---|
| 352 | document.close();
|
---|
| 353 | }
|
---|
| 354 | }
|
---|
| 355 | }
|
---|
| 356 | }
|
---|
| 357 |
|
---|
| 358 | /**
|
---|
| 359 | * This will print the usage requirements and exit.
|
---|
| 360 | */
|
---|
| 361 | private static void usage()
|
---|
| 362 | {
|
---|
| 363 | String message = "Usage: java -jar pdfbox-app-x.y.z.jar GS_PDFToImagesAndText [options] <inputfile>\n"
|
---|
| 364 | + "\nOptions:\n"
|
---|
| 365 | + " -password <password> : Password to decrypt document\n"
|
---|
| 366 | + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
|
---|
| 367 | + " -format <string> : Image format: " + getImageFormats() + "\n"
|
---|
| 368 | + " -prefix <string> : Filename prefix for image files\n"
|
---|
| 369 | + " -page <number> : The only page to extract (1-based)\n"
|
---|
| 370 | + " -startPage <int> : The first page to start extraction (1-based)\n"
|
---|
| 371 | + " -endPage <int> : The last page to extract(inclusive)\n"
|
---|
| 372 | + " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"
|
---|
| 373 | + " -dpi <int> : The DPI of the output image\n"
|
---|
| 374 | + " -cropbox <int> <int> <int> <int> : The page area to export\n"
|
---|
| 375 | + " -time : Prints timing information to stdout\n"
|
---|
| 376 | + " <inputfile> : The PDF document to use\n";
|
---|
| 377 |
|
---|
| 378 | System.err.println(message);
|
---|
| 379 | System.exit( 1 );
|
---|
| 380 | }
|
---|
| 381 |
|
---|
| 382 | private static String getImageFormats()
|
---|
| 383 | {
|
---|
| 384 | StringBuilder retval = new StringBuilder();
|
---|
| 385 | String[] formats = ImageIO.getReaderFormatNames();
|
---|
| 386 | for( int i = 0; i < formats.length; i++ )
|
---|
| 387 | {
|
---|
| 388 | if (formats[i].equalsIgnoreCase(formats[i]))
|
---|
| 389 | {
|
---|
| 390 | retval.append( formats[i] );
|
---|
| 391 | if( i + 1 < formats.length )
|
---|
| 392 | {
|
---|
| 393 | retval.append( ", " );
|
---|
| 394 | }
|
---|
| 395 | }
|
---|
| 396 | }
|
---|
| 397 | return retval.toString();
|
---|
| 398 | }
|
---|
| 399 |
|
---|
| 400 | private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
|
---|
| 401 | {
|
---|
| 402 | for (PDPage page : document.getPages())
|
---|
| 403 | {
|
---|
| 404 | System.out.println("resizing page");
|
---|
| 405 | PDRectangle rectangle = new PDRectangle();
|
---|
| 406 | rectangle.setLowerLeftX(a);
|
---|
| 407 | rectangle.setLowerLeftY(b);
|
---|
| 408 | rectangle.setUpperRightX(c);
|
---|
| 409 | rectangle.setUpperRightY(d);
|
---|
| 410 | page.setCropBox(rectangle);
|
---|
| 411 |
|
---|
| 412 | }
|
---|
| 413 | }
|
---|
| 414 | }
|
---|