Changeset 31296
- Timestamp: 2017-01-18T17:54:56+13:00 (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/vol-checker/src/org/hathitrust/extractedfeatures/VolumeCheck.java
r31279 r31296 1 1 package org.hathitrust.extractedfeatures; 2 2 3 import java.io.BufferedInputStream; 3 4 import java.io.BufferedReader; 4 5 import java.io.FileReader; 5 6 import java.io.IOException; 7 import java.io.InputStream; 8 import java.io.InputStreamReader; 6 9 import java.io.PrintWriter; 10 import java.io.UnsupportedEncodingException; 7 11 import java.util.ArrayList; 8 12 import java.util.HashMap; … … 28 32 29 33 public VolumeCheck() { 30 if (id_check_ == null) { 31 id_check_ = new HashMap<String,Boolean>(HASHMAP_INIT_SIZE); 32 storeIDs(DATA_DIR + "/full-listing.txt"); 33 } 34 34 35 } 35 36 36 protected void storeIDs( String filename)37 protected void storeIDs(BufferedReader br) 37 38 { 39 long line_num = 1; 40 String line; 41 42 String opt_file_ext = ".json.bz2"; 43 38 44 try { 39 FileReader fr = new FileReader(filename); 40 BufferedReader br = new BufferedReader(fr); 41 42 long line_num = 1; 43 String line; 44 System.err.println("Loading in volume IDS: " + filename); 45 46 String opt_file_ext = ".json.bz2"; 47 48 while ((line = br.readLine()) != null) { 49 50 String file_id = line.substring(line.lastIndexOf("/")+1); 51 String id = null; 52 if (file_id.endsWith(opt_file_ext)) { 53 id = file_id.substring(0,file_id.lastIndexOf(opt_file_ext)); 54 } 55 else { 56 id = file_id; 57 } 58 59 id = id.replaceAll("\\+", ":").replaceAll("=", "/"); 60 61 id_check_.put(id, true); 62 63 if ((line_num % 100000) == 0) { 64 System.err.println("sample id = " + id); 65 System.err.println("Passed line: " + line_num); 66 } 67 line_num++; 68 69 } 70 71 br.close(); 45 46 while ((line = br.readLine()) != null) { 47 48 String file_id = line.substring(line.lastIndexOf("/")+1); 49 String id = null; 50 if (file_id.endsWith(opt_file_ext)) { 51 id = file_id.substring(0,file_id.lastIndexOf(opt_file_ext)); 52 } 53 else { 54 id = file_id; 55 } 56 57 id = id.replaceAll("\\+", ":").replaceAll("=", "/"); 58 59 id_check_.put(id, true); 60 61 if ((line_num % 100000) == 0) { 62 
System.err.println("sample id = " + id); 63 System.err.println("Passed line: " + line_num); 64 } 65 line_num++; 66 67 } 72 68 } 73 69 catch (Exception e) { … … 80 76 */ 81 77 public void init(ServletConfig config) throws ServletException { 82 // currently everything we need to happen at start up can be done in the constructor 78 super.init(config); 79 80 if (id_check_ == null) { 81 id_check_ = new HashMap<String,Boolean>(HASHMAP_INIT_SIZE); 82 83 String htrc_list_file = "htrc-ef-all-files.txt"; 84 InputStream is = getServletContext().getResourceAsStream("/WEB-INF/" + htrc_list_file); 85 86 try { 87 System.err.println("INFO: Loading in volume IDS: " + htrc_list_file); 88 89 InputStreamReader isr = new InputStreamReader(is, "UTF-8"); 90 BufferedReader br = new BufferedReader(isr); 91 92 storeIDs(br); 93 br.close(); 94 } 95 catch (Exception e) { 96 e.printStackTrace(); 97 } 98 } 99 100 101 102 103 83 104 } 84 105
Note: See TracChangeset for help on using the changeset viewer.