Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

PDFBoxToImagesAndText.java@ 32197

Last change on this file since 32197 was 32197, checked in by ak19, 6 years ago

Updates to the recent commit's modifications to do with pdfbox: new class has been renamed from GS_PDFToImagesAndText.java to org/greenstone/pdfbox/PDFBoxToImagesAndText.java and uses a GS package. This class file is no longer included in pdfbox-app.jar, but is just compiled against that. Added Apache v 2.0 licensing related files. PDFBoxConverter.pm now refers to the newly named Java class with the new org.greenstone.pdfbox package name. Updated the Readme to add instructions to do with compiling the new java file and its new folder/package structure, and information related to the Apache license. There's also the new java/build subfolder containing the precompiled class file (and Java pkg structure) for the new class. This new build folder with the new custom class, and the modified PDFBoxConverter.pm and the modified pdfbox-app.jar (without the custom class) are modifications to the pdfbox tarball/zip files too.

File size: 15.7 KB

Line
1	/**********************************************************************
2	*
3	* PDFBoxToImagesAndText.java based on Apache PDFBox®'s PDFToImage.java
4	* with further code spliced in from its ExtractImages.java with some
5	* minor modifications.
6	*
7	* The code in this file is therefore under the same Apache License
8	* version 2.0 as Apache's PDFBox.
9	*
10	* Copyright 2018 The New Zealand Digital Library Project
11	*
12	* A component of the Greenstone digital library software
13	* from the New Zealand Digital Library Project at the
14	* University of Waikato, New Zealand.
15	*
16	* This program is free software; you can redistribute it and/or modify
17	* it under the terms of the Apache License version 2.0.
18	*
19	* This program is distributed in the hope that it will be useful,
20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	* Apache License version 2.0 for more details.
23	*
24	* You should have received a copy of the Apache License version 2.0
25	* along with this file; if not, refer to
26	* https://www.apache.org/licenses/LICENSE-2.0.
27	*
28	* The following comment is from the original file,
29	* PDFBox's PDFToImage.java
30	*
31	*********************************************************************/
32	/*
33	* Licensed to the Apache Software Foundation (ASF) under one or more
34	* contributor license agreements. See the NOTICE file distributed with
35	* this work for additional information regarding copyright ownership.
36	* The ASF licenses this file to You under the Apache License, Version 2.0
37	* (the "License"); you may not use this file except in compliance with
38	* the License. You may obtain a copy of the License at
39	*
40	* http://www.apache.org/licenses/LICENSE-2.0
41	*
42	* Unless required by applicable law or agreed to in writing, software
43	* distributed under the License is distributed on an "AS IS" BASIS,
44	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
45	* See the License for the specific language governing permissions and
46	* limitations under the License.
47	*/
48	package org.greenstone.pdfbox;
49
50	import java.awt.HeadlessException;
51	import java.awt.Toolkit;
52	import java.awt.image.BufferedImage;
53	import java.io.File;
54	import java.io.FileOutputStream;
55	import java.io.IOException;
56	import java.io.OutputStreamWriter;
57	import java.io.Writer;
58	import org.apache.pdfbox.io.IOUtils;
59
60	import javax.imageio.ImageIO;
61
62	import org.apache.pdfbox.pdmodel.PDDocument;
63	import org.apache.pdfbox.pdmodel.PDPage;
64	import org.apache.pdfbox.pdmodel.common.PDRectangle;
65	import org.apache.pdfbox.rendering.ImageType;
66	import org.apache.pdfbox.rendering.PDFRenderer;
67	import org.apache.pdfbox.tools.imageio.ImageIOUtil;
68
69	import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
70	import org.apache.pdfbox.text.PDFTextStripper;
71
72	/**
73	* This class is based on PDFToImage.java which converts
74	* the pages of a PDF document to images.
75	* This class should convert the pages to images and
76	* extract the text of each page. That part of the code
77	* to be taken from ExtractText.java
78	*
79	* Built on Apache PDFBox's PDFToImage.java with minor modifications.
80	* ak19
81	*/
82	public final class PDFBoxToImagesAndText
83	{
84	private static final String PASSWORD = "-password";
85	private static final String ENCODING = "-encoding";
86	private static final String START_PAGE = "-startPage";
87	private static final String END_PAGE = "-endPage";
88	private static final String PAGE = "-page";
89	private static final String IMAGE_TYPE = "-imageType";
90	private static final String FORMAT = "-format";
91	private static final String OUTPUT_PREFIX = "-outputPrefix";
92	private static final String PREFIX = "-prefix";
93	private static final String COLOR = "-color";
94	private static final String RESOLUTION = "-resolution";
95	private static final String DPI = "-dpi";
96	private static final String CROPBOX = "-cropbox";
97	private static final String TIME = "-time";
98
99	private static final String STD_ENCODING = "UTF-8";
100
101	/**
102	* private constructor.
103	*/
104	private PDFBoxToImagesAndText()
105	{
106	//static class
107	}
108
109	/**
110	* Infamous main method.
111	*
112	* @param args Command line arguments, should be one and a reference to a file.
113	*
114	* @throws IOException If there is an error parsing the document.
115	*/
116	public static void main( String[] args ) throws IOException
117	{
118	try
119	{
120	// force KCMS (faster than LCMS) if available
121	Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
122	System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
123	}
124	catch (ClassNotFoundException e)
125	{
126	// do nothing
127	}
128
129	// suppress the Dock icon on OS X
130	System.setProperty("apple.awt.UIElement", "true");
131
132	String password = "";
133	String encoding = STD_ENCODING;
134	String pdfFile = null;
135	String outputPrefix = null;
136	String imageFormat = "jpg";
137	int startPage = 1;
138	int endPage = Integer.MAX_VALUE;
139	String color = "rgb";
140	int dpi;
141	float cropBoxLowerLeftX = 0;
142	float cropBoxLowerLeftY = 0;
143	float cropBoxUpperRightX = 0;
144	float cropBoxUpperRightY = 0;
145	boolean showTime = false;
146	try
147	{
148	dpi = Toolkit.getDefaultToolkit().getScreenResolution();
149	}
150	catch( HeadlessException e )
151	{
152	dpi = 96;
153	}
154	for( int i = 0; i < args.length; i++ )
155	{
156	if( args[i].equals( PASSWORD ) )
157	{
158	i++;
159	if( i >= args.length )
160	{
161	usage();
162	}
163	password = args[i];
164	}
165	else if( args[i].equals( ENCODING ) )
166	{
167	i++;
168	if( i >= args.length )
169	{
170	usage();
171	}
172	encoding = args[i];
173	}
174	else if( args[i].equals( START_PAGE ) )
175	{
176	i++;
177	if( i >= args.length )
178	{
179	usage();
180	}
181	startPage = Integer.parseInt( args[i] );
182	}
183	else if( args[i].equals( END_PAGE ) )
184	{
185	i++;
186	if( i >= args.length )
187	{
188	usage();
189	}
190	endPage = Integer.parseInt( args[i] );
191	}
192	else if( args[i].equals( PAGE ) )
193	{
194	i++;
195	if( i >= args.length )
196	{
197	usage();
198	}
199	startPage = Integer.parseInt( args[i] );
200	endPage = Integer.parseInt( args[i] );
201	}
202	else if( args[i].equals(IMAGE_TYPE) \|\| args[i].equals(FORMAT) )
203	{
204	i++;
205	imageFormat = args[i];
206	}
207	else if( args[i].equals( OUTPUT_PREFIX ) \|\| args[i].equals( PREFIX ) )
208	{
209	i++;
210	outputPrefix = args[i];
211	}
212	else if( args[i].equals( COLOR ) )
213	{
214	i++;
215	color = args[i];
216	}
217	else if( args[i].equals( RESOLUTION ) \|\| args[i].equals( DPI ) )
218	{
219	i++;
220	dpi = Integer.parseInt(args[i]);
221	}
222	else if( args[i].equals( CROPBOX ) )
223	{
224	i++;
225	cropBoxLowerLeftX = Float.valueOf(args[i]);
226	i++;
227	cropBoxLowerLeftY = Float.valueOf(args[i]);
228	i++;
229	cropBoxUpperRightX = Float.valueOf(args[i]);
230	i++;
231	cropBoxUpperRightY = Float.valueOf(args[i]);
232	}
233	else if( args[i].equals( TIME ) )
234	{
235	showTime = true;
236	}
237	else
238	{
239	if( pdfFile == null )
240	{
241	pdfFile = args[i];
242	}
243	}
244	}
245	if( pdfFile == null )
246	{
247	usage();
248	}
249	else
250	{
251	if(outputPrefix == null)
252	{
253	outputPrefix = pdfFile.substring( 0, pdfFile.lastIndexOf( '.' ));
254	}
255
256	PDDocument document = null;
257	try
258	{
259	boolean extractingTextAllowed = true;
260	//String outputFile = null;
261
262	/*startProcessing("Loading PDF "+pdfFile);
263	if( outputFile == null && pdfFile.length() >4 )
264	{
265	outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
266	}*/
267
268	document = PDDocument.load(new File(pdfFile), password);
269
270	AccessPermission ap = document.getCurrentAccessPermission();
271	if( ! ap.canExtractContent() )
272	{
273	//throw new IOException( "You do not have permission to extract text" );
274	System.err.println( "*** You do not have permission to extract text" ); // still extract the pages as images then?
275	extractingTextAllowed = false;
276	}
277	//stopProcessing("Time for loading: ", startTime);
278
279	// don't extract to HTML in this class, just extract to txt
280	PDFTextStripper stripper = new PDFTextStripper();
281	//stripper.setSortByPosition( sort );
282	//stripper.setShouldSeparateByBeads( separateBeads );
283	stripper.setShouldSeparateByBeads( true );
284
285
286	ImageType imageType = null;
287	if ("bilevel".equalsIgnoreCase(color))
288	{
289	imageType = ImageType.BINARY;
290	}
291	else if ("gray".equalsIgnoreCase(color))
292	{
293	imageType = ImageType.GRAY;
294	}
295	else if ("rgb".equalsIgnoreCase(color))
296	{
297	imageType = ImageType.RGB;
298	}
299	else if ("rgba".equalsIgnoreCase(color))
300	{
301	imageType = ImageType.ARGB;
302	}
303
304	if (imageType == null)
305	{
306	System.err.println( "Error: Invalid color." );
307	System.exit( 2 );
308	}
309
310	//if a CropBox has been specified, update the CropBox:
311	//changeCropBoxes(PDDocument document,float a, float b, float c,float d)
312	if ( cropBoxLowerLeftX!=0 \|\| cropBoxLowerLeftY!=0
313	\|\| cropBoxUpperRightX!=0 \|\| cropBoxUpperRightY!=0 )
314	{
315	changeCropBox(document,
316	cropBoxLowerLeftX, cropBoxLowerLeftY,
317	cropBoxUpperRightX, cropBoxUpperRightY);
318	}
319
320	long startTime = System.nanoTime();
321
322	// render the pages
323	boolean success = true;
324	endPage = Math.min(endPage, document.getNumberOfPages());
325	PDFRenderer renderer = new PDFRenderer(document);
326	for (int i = startPage - 1; i < endPage; i++)
327	{
328	// turn page into image
329	BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
330	int lastSlash = outputPrefix.lastIndexOf(File.separator);
331	outputPrefix = outputPrefix.substring(0, lastSlash+1); // include the folder (/) but not the filename prefix
332	String fileName = outputPrefix + (i + 1) + ".";
333	success &= ImageIOUtil.writeImage(image, fileName+imageFormat, dpi);
334
335
336	// image version of page done, now extract text from current page
337	if(extractingTextAllowed) {
338	Writer output = null;
339	try {
340	output = new OutputStreamWriter( new FileOutputStream( fileName+"txt" ), encoding );
341	stripper.setStartPage( i+1 );
342	stripper.setEndPage( i+1 );
343
344	//if (debug)
345	//{
346	System.err.println("Writing to "+fileName);
347	//}
348
349	// Extract text for main document, the specified pages
350	stripper.writeText( document, output );
351	} catch (Exception ex) {
352	System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
353	} finally {
354	IOUtils.closeQuietly(output);
355	}
356	}
357	}
358
359	// GS NOTE: We just extracted text for (each page of) the main document, but
360	// we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java
361
362	// performance stats
363	long endTime = System.nanoTime();
364	long duration = endTime - startTime;
365	int count = 1 + endPage - startPage;
366	if (showTime)
367	{
368	System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
369	duration / 1000000);
370	}
371
372	if (!success)
373	{
374	System.err.println( "Error: no writer found for image format '"
375	+ imageFormat + "'" );
376	System.exit(1);
377	}
378	}
379	finally
380	{
381	if( document != null )
382	{
383	document.close();
384	}
385	}
386	}
387	}
388
389	/**
390	* This will print the usage requirements and exit.
391	*/
392	private static void usage()
393	{
394	String message = "Usage: java -jar pdfbox-app-x.y.z.jar PDFBoxToImagesAndText [options] <inputfile>\n"
395	+ "\nOptions:\n"
396	+ " -password <password> : Password to decrypt document\n"
397	+ " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
398	+ " -format <string> : Image format: " + getImageFormats() + "\n"
399	+ " -prefix <string> : Filename prefix for image files\n"
400	+ " -page <number> : The only page to extract (1-based)\n"
401	+ " -startPage <int> : The first page to start extraction (1-based)\n"
402	+ " -endPage <int> : The last page to extract(inclusive)\n"
403	+ " -color <int> : The color depth (valid: bilevel, gray, rgb, rgba)\n"
404	+ " -dpi <int> : The DPI of the output image\n"
405	+ " -cropbox <int> <int> <int> <int> : The page area to export\n"
406	+ " -time : Prints timing information to stdout\n"
407	+ " <inputfile> : The PDF document to use\n";
408
409	System.err.println(message);
410	System.exit( 1 );
411	}
412
413	private static String getImageFormats()
414	{
415	StringBuilder retval = new StringBuilder();
416	String[] formats = ImageIO.getReaderFormatNames();
417	for( int i = 0; i < formats.length; i++ )
418	{
419	if (formats[i].equalsIgnoreCase(formats[i]))
420	{
421	retval.append( formats[i] );
422	if( i + 1 < formats.length )
423	{
424	retval.append( ", " );
425	}
426	}
427	}
428	return retval.toString();
429	}
430
431	private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
432	{
433	for (PDPage page : document.getPages())
434	{
435	System.out.println("resizing page");
436	PDRectangle rectangle = new PDRectangle();
437	rectangle.setLowerLeftX(a);
438	rectangle.setLowerLeftY(b);
439	rectangle.setUpperRightX(c);
440	rectangle.setUpperRightY(d);
441	page.setCropBox(rectangle);
442
443	}
444	}
445	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/pdf-box/trunk/java/src/org/greenstone/pdfbox/PDFBoxToImagesAndText.java@ 32197

Download in other formats: