Changeset 31335 for other-projects/hathitrust
- Timestamp: 2017-01-23T20:37:32+13:00 (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/vol-checker/src/org/hathitrust/extractedfeatures/VolumeCheck.java
r31334 r31335 27 27 28 28 protected static int HASHMAP_INIT_SIZE = 13800000; 29 //protected static String DATA_DIR="D:/cygwin64/home/davidb/research/code-managed/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest"; 30 protected static HashMap<String,String> id_check_ = null; 29 protected static HashMap<String,Boolean> id_check_ = null; 31 30 32 33 31 public VolumeCheck() { 34 32 35 33 } 36 34 37 protected static final String opt_file_ext = ".json.bz2";38 39 protected String full_filename_to_tail(String full_filename)35 protected static final String file_ext = ".json.bz2"; 36 37 protected static String full_filename_to_tail(String full_filename) 40 38 { 41 39 String filename_tail = full_filename.substring(full_filename.lastIndexOf("/")+1); … … 43 41 } 44 42 45 protected String filename_tail_to_id(String filename_tail)43 protected static String filename_tail_to_id(String filename_tail) 46 44 { 47 45 String id = null; 48 if (filename_tail.endsWith( opt_file_ext)) {49 id = filename_tail.substring(0,filename_tail.lastIndexOf( opt_file_ext));46 if (filename_tail.endsWith(file_ext)) { 47 id = filename_tail.substring(0,filename_tail.lastIndexOf(file_ext)); 50 48 } 51 49 else { … … 56 54 57 55 return id; 56 } 57 58 protected static String id_to_pairtree_filename(String id) { 59 // Example :- 60 // id: miun.adx6300.0001.001 61 // pairtree filename: miun/pairtree_root/ad/x6/30/0,/00/01/,0/01/adx6300,0001,001/miun.adx6300,0001,001.json.bz2 62 63 // 1. Map 'difficult' chars: 64 // . => , 65 // : => + 66 // / => = 67 68 // 2. Process resulting string: 69 // split on first dot 70 // add "pairtree_root" 71 // then split everything else 2 chars at a time 72 73 // 3. 
Finally add in the (safely transformed) id: 74 // append directory that is 'id' 75 // further append 'id'.json.bz 76 77 78 String id_safe = id.replaceAll("\\.", ",").replaceAll(":", "+").replaceAll("/", "="); 79 80 int id_dot_pos = id_safe.indexOf("."); 81 String id_prefix = id_safe.substring(0,id_dot_pos); 82 String id_tail = id_safe.substring(id_dot_pos+1); 83 84 String [] pairs = id_tail.split("(?<=\\G..)"); 85 String joined_pairs = String.join("/", pairs); 86 87 String main_dir = id_prefix + "/pairtree_root/" + joined_pairs; 88 String filename = main_dir + "/" + id_safe + "/" + id_safe + file_ext; 89 90 return filename; 58 91 } 59 92 … … 62 95 long line_num = 1; 63 96 String line; 64 65 66 97 67 98 try { … … 74 105 String id = filename_tail_to_id(json_filename_tail); 75 106 76 id_check_.put(id, full_json_filename); 77 78 107 id_check_.put(id, true); 108 79 109 if ((line_num % 100000) == 0) { 80 110 //System.err.println("sample id = " + id); … … 99 129 100 130 if (id_check_ == null) { 101 id_check_ = new HashMap<String, String>(HASHMAP_INIT_SIZE);131 id_check_ = new HashMap<String,Boolean>(HASHMAP_INIT_SIZE); 102 132 103 133 String htrc_list_file = "htrc-ef-all-files.txt"; … … 185 215 String id = ids[i]; 186 216 187 String full_json_filename = id_check_.get(id); 188 boolean status = (full_json_filename != null); 217 boolean exists = id_check_.get(id); 189 218 190 219 if (i>0) { 191 220 pw.append(","); 192 221 } 193 pw.append("\"" + id + "\":" + status );222 pw.append("\"" + id + "\":" + exists ); 194 223 } 195 224 pw.append("}"); … … 200 229 201 230 String id = cgi_id; 202 String full_json_filename = id_check_.get(id); 203 boolean status = (full_json_filename != null); 204 pw.append("{'" + id + "':" + status + "}"); 231 boolean exists = id_check_.get(id); 232 pw.append("{'" + id + "':" + exists + "}"); 205 233 } 206 234 else if (cgi_download_id != null) { 207 235 String download_id = cgi_download_id; 208 String full_json_filename = id_check_.get(download_id); 
209 boolean exists = (full_json_filename != null); 236 boolean exists = id_check_.get(download_id); 210 237 if (!exists) { 211 238 // Error … … 214 241 else { 215 242 // rsync -av data.analytics.hathitrust.org::features/{PATH-TO-FILE} . 216 243 String full_json_filename = id_to_pairtree_filename(download_id); 244 217 245 doRsyncDownload(full_json_filename); 218 246
Note: See TracChangeset for help on using the changeset viewer.