1 | package org.atea.nlptools.ocr.servlets;
|
---|
2 |
|
---|
3 | import java.io.File;
|
---|
4 | import java.io.FileInputStream;
|
---|
5 | import java.io.IOException;
|
---|
6 | import java.lang.reflect.Type;
|
---|
7 | import java.util.ArrayList;
|
---|
8 | import java.util.Collection;
|
---|
9 | import java.util.HashMap;
|
---|
10 | import java.util.List;
|
---|
11 | import java.util.Properties;
|
---|
12 | import java.util.UUID;
|
---|
13 |
|
---|
14 | import javax.servlet.ServletException;
|
---|
15 | import javax.servlet.annotation.MultipartConfig;
|
---|
16 | import javax.servlet.http.HttpServlet;
|
---|
17 | import javax.servlet.http.HttpServletRequest;
|
---|
18 | import javax.servlet.http.HttpServletResponse;
|
---|
19 | import javax.servlet.http.Part;
|
---|
20 |
|
---|
21 | import com.google.gson.Gson;
|
---|
22 | import com.google.gson.reflect.TypeToken;
|
---|
23 | import com.google.gson.stream.JsonWriter;
|
---|
24 |
|
---|
25 | import org.apache.logging.log4j.LogManager;
|
---|
26 | import org.apache.logging.log4j.Logger;
|
---|
27 |
|
---|
28 | import org.atea.nlptools.ocr.abstractions.objects.HttpStatusCode;
|
---|
29 | import org.atea.nlptools.ocr.abstractions.objects.IOcrFile;
|
---|
30 | import org.atea.nlptools.ocr.abstractions.objects.IOcrOptions;
|
---|
31 | import org.atea.nlptools.ocr.abstractions.services.IOcrService;
|
---|
32 | import org.atea.nlptools.ocr.objects.OcrFile;
|
---|
33 | import org.atea.nlptools.ocr.objects.OcrOptionsDto;
|
---|
34 | import org.atea.nlptools.ocr.objects.TesseractOptions;
|
---|
35 | import org.atea.nlptools.ocr.services.TesseractOcrService;
|
---|
36 | import org.bytedeco.tesseract.global.tesseract;
|
---|
37 |
|
---|
38 | @MultipartConfig
|
---|
39 | public class TesseractServlet extends HttpServlet
|
---|
40 | {
|
---|
41 | /**
|
---|
42 | * Gets the maximum number of parts that may be submitted in any one request.
|
---|
43 | */
|
---|
44 | private static final int MaxParts = 10;
|
---|
45 |
|
---|
46 | /**
|
---|
47 | * Gets the maximum size in bytes that a part may be.
|
---|
48 | */
|
---|
49 | private static final int MaxPartSize = 1024 * 1024 * 50;
|
---|
50 |
|
---|
51 | private static final Logger logger = LogManager.getLogger(TesseractServlet.class);
|
---|
52 | private static final Type optionsTypeToken = new TypeToken<HashMap<String, OcrOptionsDto>>() {}.getType();
|
---|
53 |
|
---|
54 | private final Gson gsonInstance;
|
---|
55 |
|
---|
56 | private IOcrService ocrService;
|
---|
57 |
|
---|
58 | public TesseractServlet()
|
---|
59 | {
|
---|
60 | gsonInstance = new Gson();
|
---|
61 | }
|
---|
62 |
|
---|
63 | @Override
|
---|
64 | public void init()
|
---|
65 | {
|
---|
66 | Properties prop = new Properties();
|
---|
67 |
|
---|
68 | try
|
---|
69 | {
|
---|
70 | FileInputStream fis = new FileInputStream("../webapps/gs3-atea-ocr/config.properties");
|
---|
71 | prop.load(fis);
|
---|
72 | fis.close();
|
---|
73 | }
|
---|
74 | catch (IOException ex)
|
---|
75 | {
|
---|
76 | logger.error("Failed to retrieve properties file", ex);
|
---|
77 | }
|
---|
78 |
|
---|
79 | String tessBin = prop.getProperty("tesseract.data.path");
|
---|
80 | this.ocrService = new TesseractOcrService(tessBin);
|
---|
81 |
|
---|
82 | logger.info("Initialised!");
|
---|
83 | }
|
---|
84 |
|
---|
85 | @Override
|
---|
86 | protected void doGet(HttpServletRequest request, HttpServletResponse response)
|
---|
87 | throws IOException
|
---|
88 | {
|
---|
89 | response.sendError(HttpStatusCode.ClientForbidden, "POST Multipart request expected.");
|
---|
90 | }
|
---|
91 |
|
---|
92 | @Override
|
---|
93 | protected void doPost(HttpServletRequest request, HttpServletResponse response)
|
---|
94 | throws ServletException, IOException
|
---|
95 | {
|
---|
96 | try
|
---|
97 | {
|
---|
98 | logger.trace("POST request received.");
|
---|
99 | final UUID requestUUID = UUID.randomUUID();
|
---|
100 |
|
---|
101 | final TesseractOptions genericOptions = new TesseractOptions();
|
---|
102 | HashMap<String, IOcrOptions> options = ParseOptions(request);
|
---|
103 |
|
---|
104 | final List<IOcrFile> files = CreatePartTempFiles(request.getParts(), requestUUID, response);
|
---|
105 | if (files == null) {
|
---|
106 | return;
|
---|
107 | }
|
---|
108 |
|
---|
109 | response.setContentType("application/json; charset=UTF-8");
|
---|
110 | JsonWriter writer = gsonInstance.newJsonWriter(response.getWriter());
|
---|
111 | writer.beginArray();
|
---|
112 |
|
---|
113 | for (IOcrFile file : files)
|
---|
114 | {
|
---|
115 | IOcrOptions actualOptions = options.get(file.getKey());
|
---|
116 | if (actualOptions == null) {
|
---|
117 | actualOptions = genericOptions;
|
---|
118 | }
|
---|
119 |
|
---|
120 | String ocrOutput = ocrService.run(file.getTempFile(), actualOptions);
|
---|
121 |
|
---|
122 | writer.beginObject();
|
---|
123 |
|
---|
124 | writer.name("key");
|
---|
125 | writer.value(file.getKey());
|
---|
126 |
|
---|
127 | writer.name("fileName");
|
---|
128 | writer.value(file.getFileName());
|
---|
129 |
|
---|
130 | writer.name("text");
|
---|
131 | writer.value(ocrOutput);
|
---|
132 |
|
---|
133 | writer.endObject();
|
---|
134 | }
|
---|
135 |
|
---|
136 | writer.endArray();
|
---|
137 | writer.flush();
|
---|
138 | }
|
---|
139 | catch (Exception ex)
|
---|
140 | {
|
---|
141 | response.sendError(HttpStatusCode.ServerInternalServerError, "Failed to process the request.");
|
---|
142 | logger.error("Failed to complete API call", ex);
|
---|
143 | }
|
---|
144 | }
|
---|
145 |
|
---|
146 | /**
|
---|
147 | * Saves the parts in a request to temporary files.
|
---|
148 | * @param parts The parts.
|
---|
149 | * @param requestUUID The assigned UUID of the request.
|
---|
150 | * @param response The response.
|
---|
151 | * @return The list of temporary files, or <code>null</code> if the operation failed.
|
---|
152 | * @throws IOException
|
---|
153 | * @throws ServletException
|
---|
154 | */
|
---|
155 | private List<IOcrFile> CreatePartTempFiles(Collection<Part> parts, UUID requestUUID, HttpServletResponse response)
|
---|
156 | throws IOException, ServletException
|
---|
157 | {
|
---|
158 | if (parts.size() > MaxParts)
|
---|
159 | {
|
---|
160 | response.sendError(HttpStatusCode.ClientPayloadTooLarge, "No more than " + MaxParts + " parts may be submitted.");
|
---|
161 | return null;
|
---|
162 | }
|
---|
163 |
|
---|
164 | ArrayList<IOcrFile> files = new ArrayList<>();
|
---|
165 |
|
---|
166 | for (Part p : parts)
|
---|
167 | {
|
---|
168 | if (p.getSubmittedFileName() == null) {
|
---|
169 | continue;
|
---|
170 | }
|
---|
171 |
|
---|
172 | if (p.getSize() > MaxPartSize)
|
---|
173 | {
|
---|
174 | response.sendError(HttpStatusCode.ClientPayloadTooLarge, "A submitted part must be no more than " + MaxPartSize + " bytes");
|
---|
175 | return null;
|
---|
176 | }
|
---|
177 |
|
---|
178 | File tempFile = File.createTempFile("atea-ocr", requestUUID.toString());
|
---|
179 | p.write(tempFile.getAbsolutePath());
|
---|
180 |
|
---|
181 | files.add(new OcrFile(p.getName(), p.getSubmittedFileName(), tempFile));
|
---|
182 | }
|
---|
183 |
|
---|
184 | return files;
|
---|
185 | }
|
---|
186 |
|
---|
187 | /**
|
---|
188 | * Parses the options parameter.
|
---|
189 | * @param request The request.
|
---|
190 | * @return The submitted options.
|
---|
191 | */
|
---|
192 | private HashMap<String, IOcrOptions> ParseOptions(HttpServletRequest request)
|
---|
193 | {
|
---|
194 | HashMap<String, IOcrOptions> ret = new HashMap<>();
|
---|
195 |
|
---|
196 | String options = request.getParameter("options");
|
---|
197 | if (options == null) {
|
---|
198 | return ret;
|
---|
199 | }
|
---|
200 |
|
---|
201 | HashMap<String, OcrOptionsDto> dtoOptions = gsonInstance.fromJson(options, optionsTypeToken);
|
---|
202 |
|
---|
203 | for (String key : dtoOptions.keySet())
|
---|
204 | {
|
---|
205 | OcrOptionsDto dto = dtoOptions.get(key);
|
---|
206 | TesseractOptions tOptions = new TesseractOptions();
|
---|
207 |
|
---|
208 | if (dto.layoutDetection) {
|
---|
209 | tOptions.pageSegmentationMode = tesseract.PSM_AUTO_OSD;
|
---|
210 | }
|
---|
211 |
|
---|
212 | ret.put(key, tOptions);
|
---|
213 | }
|
---|
214 |
|
---|
215 | return ret;
|
---|
216 | }
|
---|
217 | }
|
---|