/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.tools;

import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;

import org.apache.pdfbox.io.IOUtils;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Command line tool that renders each page of a PDF document to an image file
 * and, when the document permissions allow it, also writes the text of each
 * page to a matching <code>.txt</code> file.
 *
 * For page number <i>n</i> the tool writes <code>n.&lt;format&gt;</code> and
 * <code>n.txt</code> into the output directory. The output directory is the
 * directory part of the <code>-prefix</code> value, or of the input file path
 * when no prefix is given.
 *
 * Built on Apache PDFBox's PDFToImage.java with minor modifications; the text
 * extraction follows ExtractText.java, except that text of embedded PDFs is
 * not extracted.
 *
 * ak19
 */
public final class GS_PDFToImagesAndText
{
    private static final String PASSWORD = "-password";
    private static final String ENCODING = "-encoding";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String PAGE = "-page";
    private static final String IMAGE_TYPE = "-imageType";
    private static final String FORMAT = "-format";
    private static final String OUTPUT_PREFIX = "-outputPrefix";
    private static final String PREFIX = "-prefix";
    private static final String COLOR = "-color";
    private static final String RESOLUTION = "-resolution";
    private static final String DPI = "-dpi";
    private static final String CROPBOX = "-cropbox";
    private static final String TIME = "-time";

    private static final String STD_ENCODING = "UTF-8";

    /**
     * Private constructor: this class only offers static entry points.
     */
    private GS_PDFToImagesAndText()
    {
        //static class
    }

    /**
     * Parses the command line, then renders the requested pages to images and
     * writes the text of each page next to its image.
     *
     * Exits with status 1 when the arguments are unusable or when no image
     * writer exists for the requested format, and with status 2 when the
     * colour name is not recognised.
     *
     * @param args Command line options followed by the PDF file to process;
     *             see {@link #usage()} for the accepted options.
     *
     * @throws IOException If there is an error loading, rendering or closing the document.
     */
    public static void main( String[] args ) throws IOException
    {
        try
        {
            // force KCMS (faster than LCMS) if available
            Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
            System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
        }
        catch (ClassNotFoundException e)
        {
            // KCMS is not present on this JVM: keep the default colour management module
        }

        // suppress the Dock icon on OS X
        System.setProperty("apple.awt.UIElement", "true");

        String password = "";
        String encoding = STD_ENCODING;
        String pdfFile = null;
        String outputPrefix = null;
        String imageFormat = "jpg";
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;
        String color = "rgb";
        int dpi;
        float cropBoxLowerLeftX = 0;
        float cropBoxLowerLeftY = 0;
        float cropBoxUpperRightX = 0;
        float cropBoxUpperRightY = 0;
        boolean showTime = false;
        try
        {
            dpi = Toolkit.getDefaultToolkit().getScreenResolution();
        }
        catch( HeadlessException e )
        {
            // no display available: fall back to a conventional screen resolution
            dpi = 96;
        }

        for( int i = 0; i < args.length; i++ )
        {
            String arg = args[i];
            if( arg.equals( PASSWORD ) )
            {
                password = requireValue( args, ++i );
            }
            else if( arg.equals( ENCODING ) )
            {
                encoding = requireValue( args, ++i );
            }
            else if( arg.equals( START_PAGE ) )
            {
                startPage = Integer.parseInt( requireValue( args, ++i ) );
            }
            else if( arg.equals( END_PAGE ) )
            {
                endPage = Integer.parseInt( requireValue( args, ++i ) );
            }
            else if( arg.equals( PAGE ) )
            {
                startPage = Integer.parseInt( requireValue( args, ++i ) );
                endPage = startPage;
            }
            else if( arg.equals( IMAGE_TYPE ) || arg.equals( FORMAT ) )
            {
                imageFormat = requireValue( args, ++i );
            }
            else if( arg.equals( OUTPUT_PREFIX ) || arg.equals( PREFIX ) )
            {
                outputPrefix = requireValue( args, ++i );
            }
            else if( arg.equals( COLOR ) )
            {
                color = requireValue( args, ++i );
            }
            else if( arg.equals( RESOLUTION ) || arg.equals( DPI ) )
            {
                dpi = Integer.parseInt( requireValue( args, ++i ) );
            }
            else if( arg.equals( CROPBOX ) )
            {
                cropBoxLowerLeftX = Float.parseFloat( requireValue( args, ++i ) );
                cropBoxLowerLeftY = Float.parseFloat( requireValue( args, ++i ) );
                cropBoxUpperRightX = Float.parseFloat( requireValue( args, ++i ) );
                cropBoxUpperRightY = Float.parseFloat( requireValue( args, ++i ) );
            }
            else if( arg.equals( TIME ) )
            {
                showTime = true;
            }
            else if( pdfFile == null )
            {
                // the first non-option argument is the input file; later ones are ignored
                pdfFile = arg;
            }
        }

        if( pdfFile == null )
        {
            usage();
            return;
        }

        // Validate the colour before opening the document so that exiting here
        // cannot leave an open document behind.
        ImageType imageType = parseImageType( color );
        if (imageType == null)
        {
            System.err.println( "Error: Invalid color." );
            System.exit( 2 );
        }

        // Only the directory part of the prefix is used: output files are named
        // after the page number, not after the prefix or the input file.
        String outputDir = outputDirectoryOf( outputPrefix != null ? outputPrefix : pdfFile );

        boolean success = true;
        PDDocument document = null;
        try
        {
            document = PDDocument.load(new File(pdfFile), password);

            AccessPermission ap = document.getCurrentAccessPermission();
            boolean extractingTextAllowed = ap.canExtractContent();
            if( !extractingTextAllowed )
            {
                // not fatal: the pages are still rendered to images
                System.err.println( "*** You do not have permission to extract text" );
            }

            // don't extract to HTML in this class, just extract to txt
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setShouldSeparateByBeads( true );

            // if a CropBox has been specified, update the CropBox of every page
            if ( cropBoxLowerLeftX != 0 || cropBoxLowerLeftY != 0
                    || cropBoxUpperRightX != 0 || cropBoxUpperRightY != 0 )
            {
                changeCropBox(document,
                        cropBoxLowerLeftX, cropBoxLowerLeftY,
                        cropBoxUpperRightX, cropBoxUpperRightY);
            }

            long startTime = System.nanoTime();

            // clamp the requested range to the pages that actually exist
            int firstPage = Math.max(startPage, 1);
            int lastPage = Math.min(endPage, document.getNumberOfPages());
            PDFRenderer renderer = new PDFRenderer(document);
            for (int pageIndex = firstPage - 1; pageIndex < lastPage; pageIndex++)
            {
                int pageNumber = pageIndex + 1;
                String fileName = outputDir + pageNumber + ".";

                // turn page into image
                BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, imageType);
                success &= ImageIOUtil.writeImage(image, fileName + imageFormat, dpi);

                // image version of page done, now extract text from current page
                if (extractingTextAllowed)
                {
                    writePageText(document, stripper, pageNumber, fileName, encoding);
                }
            }
            // GS NOTE: We just extracted text for (each page of) the main document, but
            // we're not additionally extracting text for any "embedded PDFs" as is done
            // in ExtractText.java

            // performance stats
            long duration = System.nanoTime() - startTime;
            int count = Math.max(0, 1 + lastPage - firstPage);
            if (showTime)
            {
                System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
                                  duration / 1000000);
            }
        }
        finally
        {
            if( document != null )
            {
                document.close();
            }
        }

        // reported after the document has been closed, because System.exit skips finally blocks
        if (!success)
        {
            System.err.println( "Error: no writer found for image format '"
                    + imageFormat + "'" );
            System.exit(1);
        }
    }

    /**
     * Returns the command line argument at the given index, or prints the usage
     * message and exits when the option at the end of the command line has no value.
     *
     * @param args  The command line arguments.
     * @param index Index of the value that must be present.
     * @return The argument at <code>index</code>.
     */
    private static String requireValue(String[] args, int index)
    {
        if( index >= args.length )
        {
            usage();
        }
        return args[index];
    }

    /**
     * Maps a colour name from the command line to the image type used for rendering.
     *
     * @param color One of bilevel, gray, rgb or rgba (case-insensitive).
     * @return The matching image type, or <code>null</code> when the name is not recognised.
     */
    private static ImageType parseImageType(String color)
    {
        if ("bilevel".equalsIgnoreCase(color))
        {
            return ImageType.BINARY;
        }
        if ("gray".equalsIgnoreCase(color))
        {
            return ImageType.GRAY;
        }
        if ("rgb".equalsIgnoreCase(color))
        {
            return ImageType.RGB;
        }
        if ("rgba".equalsIgnoreCase(color))
        {
            return ImageType.ARGB;
        }
        return null;
    }

    /**
     * Returns the directory part of a path, including the trailing separator.
     * Both '/' and the platform separator are recognised.
     *
     * @param path A file path or output prefix.
     * @return Everything up to and including the last separator, or the empty
     *         string (meaning the current directory) when there is no separator.
     */
    private static String outputDirectoryOf(String path)
    {
        int lastSeparator = Math.max(path.lastIndexOf('/'),
                                     path.lastIndexOf(File.separatorChar));
        return path.substring(0, lastSeparator + 1);
    }

    /**
     * Writes the text of one page to <code>fileName + "txt"</code>. Failures are
     * reported on stderr and do not stop the processing of the remaining pages.
     *
     * @param document   The document to extract text from.
     * @param stripper   The text stripper to use; its page range is set to the given page.
     * @param pageNumber The 1-based number of the page to extract.
     * @param fileName   Output path up to and including the dot before the extension.
     * @param encoding   Name of the character encoding of the text file.
     */
    private static void writePageText(PDDocument document, PDFTextStripper stripper,
                                      int pageNumber, String fileName, String encoding)
    {
        Writer output = null;
        try
        {
            output = new OutputStreamWriter( new FileOutputStream( fileName + "txt" ), encoding );
            stripper.setStartPage( pageNumber );
            stripper.setEndPage( pageNumber );
            System.err.println("Writing to "+fileName);
            stripper.writeText( document, output );
        }
        catch (Exception ex)
        {
            // best effort by design: a page whose text cannot be written must not abort the run
            System.err.println( "*** Unable to create txt file " + fileName
                    + "txt. Exception: " + ex.getMessage());
        }
        finally
        {
            IOUtils.closeQuietly(output);
        }
    }

    /**
     * This will print the usage requirements and exit with status 1.
     */
    private static void usage()
    {
        String message = "Usage: java -jar pdfbox-app-x.y.z.jar GS_PDFToImagesAndText [options] <inputfile>\n"
            + "\nOptions:\n"
            + "  -password <password>            : Password to decrypt document\n"
            + "  -encoding <output encoding>     : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
            + "  -format <string>                : Image format: " + getImageFormats() + "\n"
            + "  -prefix <string>                : Output prefix; only its directory part is used,\n"
            + "                                    files are named <page>.<format> and <page>.txt\n"
            + "  -page <number>                  : The only page to extract (1-based)\n"
            + "  -startPage <number>             : The first page to start extraction (1-based)\n"
            + "  -endPage <number>               : The last page to extract (inclusive)\n"
            + "  -color <string>                 : The color depth (valid: bilevel, gray, rgb, rgba)\n"
            + "  -dpi <number>                   : The DPI of the output image\n"
            + "  -cropbox <x1> <y1> <x2> <y2>    : The page area to export\n"
            + "  -time                           : Prints timing information to stderr\n"
            + "  <inputfile>                     : The PDF document to use\n";

        System.err.println(message);
        System.exit( 1 );
    }

    /**
     * Lists the image formats that can be written, for the usage message.
     *
     * @return The ImageIO writer format names, lower-cased, without duplicates,
     *         sorted and separated by ", ".
     */
    private static String getImageFormats()
    {
        // ImageIO reports the same format in several spellings (e.g. "jpg" and "JPG")
        Set<String> uniqueNames = new TreeSet<String>();
        for (String name : ImageIO.getWriterFormatNames())
        {
            uniqueNames.add(name.toLowerCase(Locale.ROOT));
        }

        StringBuilder retval = new StringBuilder();
        for (String name : uniqueNames)
        {
            if (retval.length() > 0)
            {
                retval.append( ", " );
            }
            retval.append( name );
        }
        return retval.toString();
    }

    /**
     * Replaces the crop box of every page of the document with the given rectangle.
     *
     * @param document The document whose pages are changed.
     * @param a Lower left x coordinate of the new crop box.
     * @param b Lower left y coordinate of the new crop box.
     * @param c Upper right x coordinate of the new crop box.
     * @param d Upper right y coordinate of the new crop box.
     */
    private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
    {
        for (PDPage page : document.getPages())
        {
            System.out.println("resizing page");
            // a fresh rectangle per page, so that pages do not share one crop box object
            PDRectangle rectangle = new PDRectangle();
            rectangle.setLowerLeftX(a);
            rectangle.setLowerLeftY(b);
            rectangle.setUpperRightX(c);
            rectangle.setUpperRightY(d);
            page.setCropBox(rectangle);
        }
    }
}