Context Navigation

PDFBoxToImagesAndText.java@ 32278

Last change on this file since 32278 was 32278, checked in by ak19, 6 years ago
Our custom pdf-box class PDFToImagesAndText.java now takes two additional flags, textOnly and imagesOnly, which can be used to support paged_text and the original pagedimg_ output formats, besides pagedimgtxt_
File size: 16.3 KB

Line
1	/**********************************************************************
2	*
3	* PDFBoxToImagesAndText.java based on Apache PDFBox®'s PDFToImage.java
4	* with further code spliced in from its ExtractText.java with some
5	* minor modifications.
6	*
7	* The code in this file is therefore under the same Apache License
8	* version 2.0 as Apache's PDFBox.
9	*
10	* Copyright 2018 The New Zealand Digital Library Project
11	*
12	* A component of the Greenstone digital library software
13	* from the New Zealand Digital Library Project at the
14	* University of Waikato, New Zealand.
15	*
16	* This program is free software; you can redistribute it and/or modify
17	* it under the terms of the Apache License version 2.0.
18	*
19	* This program is distributed in the hope that it will be useful,
20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	* GNU General Public License for more details.
23	*
24	* You should have received a copy of the Apache License version 2.0
25	* along with this file; if not, refer to
26	* https://www.apache.org/licenses/LICENSE-2.0.
27	*
28	* The following comment is from the original file,
29	* PDFBox's PDFToImage.java
30	*
31	*********************************************************************/
32	/*
33	* Licensed to the Apache Software Foundation (ASF) under one or more
34	* contributor license agreements. See the NOTICE file distributed with
35	* this work for additional information regarding copyright ownership.
36	* The ASF licenses this file to You under the Apache License, Version 2.0
37	* (the "License"); you may not use this file except in compliance with
38	* the License. You may obtain a copy of the License at
39	*
40	* http://www.apache.org/licenses/LICENSE-2.0
41	*
42	* Unless required by applicable law or agreed to in writing, software
43	* distributed under the License is distributed on an "AS IS" BASIS,
44	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
45	* See the License for the specific language governing permissions and
46	* limitations under the License.
47	*/
48	package org.greenstone.pdfbox;
49
50	import java.awt.HeadlessException;
51	import java.awt.Toolkit;
52	import java.awt.image.BufferedImage;
53	import java.io.File;
54	import java.io.FileOutputStream;
55	import java.io.IOException;
56	import java.io.OutputStreamWriter;
57	import java.io.Writer;
58	import org.apache.pdfbox.io.IOUtils;
59
60	import javax.imageio.ImageIO;
61
62	import org.apache.pdfbox.pdmodel.PDDocument;
63	import org.apache.pdfbox.pdmodel.PDPage;
64	import org.apache.pdfbox.pdmodel.common.PDRectangle;
65	import org.apache.pdfbox.rendering.ImageType;
66	import org.apache.pdfbox.rendering.PDFRenderer;
67	import org.apache.pdfbox.tools.imageio.ImageIOUtil;
68
69	import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
70	import org.apache.pdfbox.text.PDFTextStripper;
71
72	/**
73	* This class is based on PDFToImage.java which converts
74	* the pages of a PDF document to images.
75	* This class should convert the pages to images and
76	* extract the text of each page. The latter part of the code
77	* is taken from ExtractText.java.
78	* Variables textOnly and imagesOnly determine which aspect
79	* is output for each page, or whether both an image and text
80	* are output per page.
81	*
82	* Built on Apache PDFBox's PDFToImage.java with minor modifications.
83	* ak19
84	*/
85	public final class PDFBoxToImagesAndText
86	{
87	private static final String PASSWORD = "-password";
88	private static final String ENCODING = "-encoding";
89	private static final String START_PAGE = "-startPage";
90	private static final String END_PAGE = "-endPage";
91	private static final String PAGE = "-page";
92	private static final String IMAGE_TYPE = "-imageType";
93	private static final String FORMAT = "-format";
94	private static final String OUTPUT_PREFIX = "-outputPrefix";
95	private static final String PREFIX = "-prefix";
96	private static final String COLOR = "-color";
97	private static final String RESOLUTION = "-resolution";
98	private static final String DPI = "-dpi";
99	private static final String CROPBOX = "-cropbox";
100	private static final String TIME = "-time";
101	private static final String TEXT_ONLY = "-textOnly"; // output just the text per page
102	private static final String IMAGES_ONLY = "-imagesOnly"; // output just an image per page
103
104	private static final String STD_ENCODING = "UTF-8";
105
106	/**
107	* private constructor.
108	*/
109	private PDFBoxToImagesAndText()
110	{
111	//static class
112	}
113
114	/**
115	* Infamous main method.
116	*
117	* @param args Command line arguments, should be one and a reference to a file.
118	*
119	* @throws IOException If there is an error parsing the document.
120	*/
121	public static void main( String[] args ) throws IOException
122	{
123	try
124	{
125	// force KCMS (faster than LCMS) if available
126	Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
127	System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
128	}
129	catch (ClassNotFoundException e)
130	{
131	// do nothing
132	}
133
134	// suppress the Dock icon on OS X
135	System.setProperty("apple.awt.UIElement", "true");
136
137	String password = "";
138	String encoding = STD_ENCODING;
139	String pdfFile = null;
140	String outputPrefix = null;
141	String imageFormat = "jpg";
142	int startPage = 1;
143	int endPage = Integer.MAX_VALUE;
144	String color = "rgb";
145	int dpi;
146	float cropBoxLowerLeftX = 0;
147	float cropBoxLowerLeftY = 0;
148	float cropBoxUpperRightX = 0;
149	float cropBoxUpperRightY = 0;
150	boolean showTime = false;
151	boolean textOnly = false;
152	boolean imagesOnly = false;
153
154	try
155	{
156	dpi = Toolkit.getDefaultToolkit().getScreenResolution();
157	}
158	catch( HeadlessException e )
159	{
160	dpi = 96;
161	}
162	for( int i = 0; i < args.length; i++ )
163	{
164	if( args[i].equals( PASSWORD ) )
165	{
166	i++;
167	if( i >= args.length )
168	{
169	usage();
170	}
171	password = args[i];
172	}
173	else if( args[i].equals( ENCODING ) )
174	{
175	i++;
176	if( i >= args.length )
177	{
178	usage();
179	}
180	encoding = args[i];
181	}
182	else if( args[i].equals( START_PAGE ) )
183	{
184	i++;
185	if( i >= args.length )
186	{
187	usage();
188	}
189	startPage = Integer.parseInt( args[i] );
190	}
191	else if( args[i].equals( END_PAGE ) )
192	{
193	i++;
194	if( i >= args.length )
195	{
196	usage();
197	}
198	endPage = Integer.parseInt( args[i] );
199	}
200	else if( args[i].equals( PAGE ) )
201	{
202	i++;
203	if( i >= args.length )
204	{
205	usage();
206	}
207	startPage = Integer.parseInt( args[i] );
208	endPage = Integer.parseInt( args[i] );
209	}
210	else if( args[i].equals(IMAGE_TYPE) \|\| args[i].equals(FORMAT) )
211	{
212	i++;
213	imageFormat = args[i];
214	}
215	else if( args[i].equals( OUTPUT_PREFIX ) \|\| args[i].equals( PREFIX ) )
216	{
217	i++;
218	outputPrefix = args[i];
219	}
220	else if( args[i].equals( COLOR ) )
221	{
222	i++;
223	color = args[i];
224	}
225	else if( args[i].equals( RESOLUTION ) \|\| args[i].equals( DPI ) )
226	{
227	i++;
228	dpi = Integer.parseInt(args[i]);
229	}
230	else if( args[i].equals( CROPBOX ) )
231	{
232	i++;
233	cropBoxLowerLeftX = Float.valueOf(args[i]);
234	i++;
235	cropBoxLowerLeftY = Float.valueOf(args[i]);
236	i++;
237	cropBoxUpperRightX = Float.valueOf(args[i]);
238	i++;
239	cropBoxUpperRightY = Float.valueOf(args[i]);
240	}
241	else if( args[i].equals( TEXT_ONLY ) )
242	{
243	textOnly = true;
244	}
245	else if( args[i].equals( IMAGES_ONLY ) )
246	{
247	imagesOnly = true;
248	}
249	else if( args[i].equals( TIME ) )
250	{
251	showTime = true;
252	}
253	else
254	{
255	if( pdfFile == null )
256	{
257	pdfFile = args[i];
258	}
259	}
260	}
261	if( pdfFile == null )
262	{
263	usage();
264	}
265	else
266	{
267	if(outputPrefix == null)
268	{
269	outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));
270	}
271
272	PDDocument document = null;
273	try
274	{
275	boolean extractingTextAllowed = true;
276	//String outputFile = null;
277
278	/*startProcessing("Loading PDF "+pdfFile);
279	if( outputFile == null && pdfFile.length() >4 )
280	{
281	outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
282	}*/
283
284	document = PDDocument.load(new File(pdfFile), password);
285
286	AccessPermission ap = document.getCurrentAccessPermission();
287	if( ! ap.canExtractContent() )
288	{
289	//throw new IOException( "You do not have permission to extract text" );
290	System.err.println( "*** You do not have permission to extract text" ); // still extract the pages as images then?
291	extractingTextAllowed = false;
292	}
293	//stopProcessing("Time for loading: ", startTime);
294
295	// don't extract to HTML in this class, just extract to txt
296	PDFTextStripper stripper = new PDFTextStripper();
297	//stripper.setSortByPosition( sort );
298	//stripper.setShouldSeparateByBeads( separateBeads );
299	stripper.setShouldSeparateByBeads( true );
300
301
302	ImageType imageType = null;
303	if ("bilevel".equalsIgnoreCase(color))
304	{
305	imageType = ImageType.BINARY;
306	}
307	else if ("gray".equalsIgnoreCase(color))
308	{
309	imageType = ImageType.GRAY;
310	}
311	else if ("rgb".equalsIgnoreCase(color))
312	{
313	imageType = ImageType.RGB;
314	}
315	else if ("rgba".equalsIgnoreCase(color))
316	{
317	imageType = ImageType.ARGB;
318	}
319
320	if (imageType == null)
321	{
322	System.err.println( "Error: Invalid color." );
323	System.exit( 2 );
324	}
325
326	//if a CropBox has been specified, update the CropBox:
327	//changeCropBoxes(PDDocument document,float a, float b, float c,float d)
328	if ( cropBoxLowerLeftX!=0 \|\| cropBoxLowerLeftY!=0
329	\|\| cropBoxUpperRightX!=0 \|\| cropBoxUpperRightY!=0 )
330	{
331	changeCropBox(document,
332	cropBoxLowerLeftX, cropBoxLowerLeftY,
333	cropBoxUpperRightX, cropBoxUpperRightY);
334	}
335
336	long startTime = System.nanoTime();
337
338	// render the pages
339	boolean success = true;
340	endPage = Math.min(endPage, document.getNumberOfPages());
341	PDFRenderer renderer = new PDFRenderer(document);
342	for (int i = startPage - 1; i < endPage; i++)
343	{
344	int lastSlash = outputPrefix.lastIndexOf(File.separator);
345	outputPrefix = outputPrefix.substring(0, lastSlash+1); // include the folder (/) but not the filename prefix
346	String fileName = outputPrefix + (i + 1) + ".";
347
348	if(!textOnly) {
349	// turn page into image
350	BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
351	success &= ImageIOUtil.writeImage(image, fileName+imageFormat, dpi);
352	}
353
354	// image version of page done, now extract text from current page
355	if(!imagesOnly && extractingTextAllowed) {
356	Writer output = null;
357	try {
358	output = new OutputStreamWriter( new FileOutputStream( fileName+"txt" ), encoding );
359	stripper.setStartPage( i+1 );
360	stripper.setEndPage( i+1 );
361
362	//if (debug)
363	//{
364	System.err.println("Writing to "+fileName);
365	//}
366
367	// Extract text for main document, the specified pages
368	stripper.writeText( document, output );
369	} catch (Exception ex) {
370	System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
371	} finally {
372	IOUtils.closeQuietly(output);
373	}
374	}
375	}
376
377	// GS NOTE: We just extracted text for (each page of) the main document, but
378	// we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java
379
380	// performance stats
381	long endTime = System.nanoTime();
382	long duration = endTime - startTime;
383	int count = 1 + endPage - startPage;
384	if (showTime)
385	{
386	System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
387	duration / 1000000);
388	}
389
390	if (!success)
391	{
392	System.err.println( "Error: no writer found for image format '"
393	+ imageFormat + "'" );
394	System.exit(1);
395	}
396	}
397	finally
398	{
399	if( document != null )
400	{
401	document.close();
402	}
403	}
404	}
405	}
406
407	/**
408	* This will print the usage requirements and exit.
409	*/
410	private static void usage()
411	{
412	String message = "Usage: java -jar pdfbox-app-x.y.z.jar PDFBoxToImagesAndText [options] <inputfile>\n"
413	+ "\nOptions:\n"
414	+ " -password <password> : Password to decrypt document\n"
415	+ " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
416	+ " -format <string> : Image format: " + getImageFormats() + "\n"
417	+ " -prefix <string> : Filename prefix for image files\n"
418	+ " -page <number> : The only page to extract (1-based)\n"
419	+ " -startPage <int> : The first page to start extraction (1-based)\n"
420	+ " -endPage <int> : The last page to extract(inclusive)\n"
421	+ " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"
422	+ " -dpi <int> : The DPI of the output image\n"
423	+ " -cropbox <int> <int> <int> <int> : The page area to export\n"
424	+ " -time : Prints timing information to stdout\n"
425	+ " <inputfile> : The PDF document to use\n";
426
427	System.err.println(message);
428	System.exit( 1 );
429	}
430
431	private static String getImageFormats()
432	{
433	StringBuilder retval = new StringBuilder();
434	String[] formats = ImageIO.getReaderFormatNames();
435	for( int i = 0; i < formats.length; i++ )
436	{
437	if (formats[i].equalsIgnoreCase(formats[i]))
438	{
439	retval.append( formats[i] );
440	if( i + 1 < formats.length )
441	{
442	retval.append( ", " );
443	}
444	}
445	}
446	return retval.toString();
447	}
448
449	private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
450	{
451	for (PDPage page : document.getPages())
452	{
453	System.out.println("resizing page");
454	PDRectangle rectangle = new PDRectangle();
455	rectangle.setLowerLeftX(a);
456	rectangle.setLowerLeftY(b);
457	rectangle.setUpperRightX(c);
458	rectangle.setUpperRightY(d);
459	page.setCropBox(rectangle);
460
461	}
462	}
463	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/pdf-box/trunk/java/src/org/greenstone/pdfbox/PDFBoxToImagesAndText.java@ 32278

Download in other formats: