source: other-projects/hathitrust/wcsa/vol-checker/src/org/hathitrust/extractedfeatures/VolumeCheck.java@ 31335

Last change on this file since 31335 was 31335, checked in by davidb, 7 years ago

Too expensive to hold pairtree filename in hashmap, so change to computing on-the-fly from 'id'

  • Property svn:executable set to *
File size: 7.5 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedReader;
5import java.io.FileReader;
6import java.io.IOException;
7import java.io.InputStream;
8import java.io.InputStreamReader;
9import java.io.PrintWriter;
10import java.io.UnsupportedEncodingException;
11import java.util.ArrayList;
12import java.util.HashMap;
13
14import javax.servlet.ServletConfig;
15import javax.servlet.ServletException;
16import javax.servlet.annotation.WebServlet;
17import javax.servlet.http.HttpServlet;
18import javax.servlet.http.HttpServletRequest;
19import javax.servlet.http.HttpServletResponse;
20
21/**
22 * Servlet implementation class VolumeCheck
23 */
24@WebServlet("/VolumeCheck")
25public class VolumeCheck extends HttpServlet {
26 private static final long serialVersionUID = 1L;
27
28 protected static int HASHMAP_INIT_SIZE = 13800000;
29 protected static HashMap<String,Boolean> id_check_ = null;
30
31 public VolumeCheck() {
32
33 }
34
35 protected static final String file_ext = ".json.bz2";
36
37 protected static String full_filename_to_tail(String full_filename)
38 {
39 String filename_tail = full_filename.substring(full_filename.lastIndexOf("/")+1);
40 return filename_tail;
41 }
42
43 protected static String filename_tail_to_id(String filename_tail)
44 {
45 String id = null;
46 if (filename_tail.endsWith(file_ext)) {
47 id = filename_tail.substring(0,filename_tail.lastIndexOf(file_ext));
48 }
49 else {
50 id = filename_tail;
51 }
52
53 id = id.replaceAll("\\+", ":").replaceAll("=", "/");
54
55 return id;
56 }
57
58 protected static String id_to_pairtree_filename(String id) {
59 // Example :-
60 // id: miun.adx6300.0001.001
61 // pairtree filename: miun/pairtree_root/ad/x6/30/0,/00/01/,0/01/adx6300,0001,001/miun.adx6300,0001,001.json.bz2
62
63 // 1. Map 'difficult' chars:
64 // . => ,
65 // : => +
66 // / => =
67
68 // 2. Process resulting string:
69 // split on first dot
70 // add "pairtree_root"
71 // then split everything else 2 chars at a time
72
73 // 3. Finally add in the (safely transformed) id:
74 // append directory that is 'id'
75 // further append 'id'.json.bz
76
77
78 String id_safe = id.replaceAll("\\.", ",").replaceAll(":", "+").replaceAll("/", "=");
79
80 int id_dot_pos = id_safe.indexOf(".");
81 String id_prefix = id_safe.substring(0,id_dot_pos);
82 String id_tail = id_safe.substring(id_dot_pos+1);
83
84 String [] pairs = id_tail.split("(?<=\\G..)");
85 String joined_pairs = String.join("/", pairs);
86
87 String main_dir = id_prefix + "/pairtree_root/" + joined_pairs;
88 String filename = main_dir + "/" + id_safe + "/" + id_safe + file_ext;
89
90 return filename;
91 }
92
93 protected void storeIDs(BufferedReader br)
94 {
95 long line_num = 1;
96 String line;
97
98 try {
99
100 System.err.print("Loading hashmap: ");
101 while ((line = br.readLine()) != null) {
102
103 String full_json_filename = line;
104 String json_filename_tail = full_filename_to_tail(full_json_filename);
105 String id = filename_tail_to_id(json_filename_tail);
106
107 id_check_.put(id, true);
108
109 if ((line_num % 100000) == 0) {
110 //System.err.println("sample id = " + id);
111 //System.err.println("Passed line: " + line_num);
112 System.err.print(".");
113 }
114 line_num++;
115
116 }
117 System.err.println(" => done.");
118 }
119 catch (Exception e) {
120 e.printStackTrace();
121 }
122
123 }
124 /**
125 * @see Servlet#init(ServletConfig)
126 */
127 public void init(ServletConfig config) throws ServletException {
128 super.init(config);
129
130 if (id_check_ == null) {
131 id_check_ = new HashMap<String,Boolean>(HASHMAP_INIT_SIZE);
132
133 String htrc_list_file = "htrc-ef-all-files.txt";
134 InputStream is = getServletContext().getResourceAsStream("/WEB-INF/" + htrc_list_file);
135
136 try {
137 System.err.println("INFO: Loading in volume IDS: " + htrc_list_file);
138
139 InputStreamReader isr = new InputStreamReader(is, "UTF-8");
140 BufferedReader br = new BufferedReader(isr);
141
142 storeIDs(br);
143 br.close();
144 }
145 catch (Exception e) {
146 e.printStackTrace();
147 }
148 }
149 }
150
151 protected void doRsyncDownload(String full_json_filename)
152 {
153 String json_filename_tail = full_filename_to_tail(full_json_filename);
154 //String cmd = "rsync -av data.analytics.hathitrust.org::features/" + full_json_filename + ".";
155
156
157 Runtime rt = Runtime.getRuntime();
158 String[] command = {"rsync","-av","data.analytics.hathitrust.org::features/" + full_json_filename, "."};
159
160
161 try {
162 Process proc = rt.exec(command);
163
164 /*
165 BufferedReader stdInput = new BufferedReader(new
166 InputStreamReader(proc.getInputStream()));
167
168 BufferedReader stdError = new BufferedReader(new
169 InputStreamReader(proc.getErrorStream()));
170
171 // read the output from the command
172 System.out.println("Here is the standard output of the command:\n");
173 String s = null;
174 while ((s = stdInput.readLine()) != null) {
175 System.out.println(s);
176 }
177
178 // read any errors from the attempted command
179 System.out.println("Here is the standard error of the command (if any):\n");
180 while ((s = stdError.readLine()) != null) {
181 System.out.println(s);
182 }
183 */
184
185 proc.waitFor();
186 System.err.println("*** Rsync finished");
187
188 //System.out.println("Done.");
189
190 }
191 catch (Exception e) {
192 e.printStackTrace();
193 }
194
195 }
196 /**
197 * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
198 */
199 protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
200 PrintWriter pw = response.getWriter();
201
202 String cgi_ids = request.getParameter("ids");
203 String cgi_id = request.getParameter("id");
204 String cgi_download_id = request.getParameter("download-id");
205
206 if (cgi_ids != null) {
207 response.setContentType("application/json");
208
209 String[] ids = cgi_ids.split(",");
210 int ids_len = ids.length;
211
212 pw.append("{");
213
214 for (int i=0; i<ids_len; i++) {
215 String id = ids[i];
216
217 boolean exists = id_check_.get(id);
218
219 if (i>0) {
220 pw.append(",");
221 }
222 pw.append("\"" + id + "\":" + exists );
223 }
224 pw.append("}");
225
226 }
227 else if (cgi_id != null) {
228 response.setContentType("application/json");
229
230 String id = cgi_id;
231 boolean exists = id_check_.get(id);
232 pw.append("{'" + id + "':" + exists + "}");
233 }
234 else if (cgi_download_id != null) {
235 String download_id = cgi_download_id;
236 boolean exists = id_check_.get(download_id);
237 if (!exists) {
238 // Error
239 response.sendError(HttpServletResponse.SC_BAD_REQUEST, "The requested volume id does not exist.");
240 }
241 else {
242 // rsync -av data.analytics.hathitrust.org::features/{PATH-TO-FILE} .
243 String full_json_filename = id_to_pairtree_filename(download_id);
244
245 doRsyncDownload(full_json_filename);
246
247 }
248 }
249 else {
250
251 pw.append("General Info: Number of HTRC Volumes in check-list = " + id_check_.size());
252
253 }
254 //pw.close();
255
256 }
257
258 /**
259 * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
260 */
261 protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
262 doGet(request, response);
263 }
264
265}
Note: See TracBrowser for help on using the repository browser.