Changeset 31312 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeMongoDBDocumentsMap.java
- Timestamp:
- 2017-01-21T23:57:09+13:00 (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeMongoDBDocumentsMap.java
r31311 r31312 2 2 3 3 import java.io.IOException; 4 import java.io.Reader; 5 import java.io.StringReader; 6 import java.util.ArrayList; 7 import java.util.Iterator; 8 import java.util.List; 9 import java.util.Set; 10 11 import org.apache.lucene.analysis.TokenStream; 12 import org.apache.lucene.analysis.core.LowerCaseFilter; 13 import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; 14 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 4 15 import org.apache.spark.api.java.function.Function; 5 16 import org.apache.spark.util.DoubleAccumulator; 6 17 import org.bson.Document; 18 import org.json.JSONArray; 7 19 import org.json.JSONObject; 8 20 … … 39 51 } 40 52 53 protected void fixup_section(Document ef_count) 54 { 55 Set<String> key_set = ef_count.keySet(); 56 for (String key : key_set) { 57 if (key.matches("\\.")) { 58 String new_key = key.replaceAll("\\.", "<PERIOD>"); 59 ef_count.put(new_key, ef_count.get(key)); 60 ef_count.remove(key); 61 key = new_key; 62 } 63 64 if (key.matches("\\$")) { 65 String new_key = key.replaceAll("\\$", "<DOLLAR>"); 66 ef_count.put(new_key, ef_count.get(key)); 67 ef_count.remove(key); 68 } 69 70 } 71 } 72 73 protected void fixup_page(String volume_id, String page_id, Document ef_page) 74 { 75 if (ef_page != null) { 76 String[] zone_keys = { "header", "body", "footer" }; 77 78 for (String zone_key: zone_keys) { 79 Document ef_zone = (Document)ef_page.get(zone_key); 80 if (ef_zone != null) { 81 String[] count_keys = { "beginCharCounts", "endCharCount", "tokenPosCount" }; 82 83 for (String count_key: count_keys) { 84 Document ef_sub_section = (Document)ef_zone.get(count_key); 85 if (ef_sub_section != null) { 86 fixup_section(ef_sub_section); 87 88 if (count_key.equals("tokenPostCount")) { 89 Set<String> key_set = ef_sub_section.keySet(); 90 for (String key : key_set) { 91 Document token_section = (Document)ef_sub_section.get(key); 92 fixup_section(token_section); 93 } 94 } 95 96 97 } 98 } 99 } 100 } 101 } 102 else { 103 System.err.println("Warning: null page for '" + page_id + "'"); 104 } 105 106 } 107 protected void fixup_volume(String json_file_in, Document extracted_feature_record) 108 { 109 String full_json_file_in = _input_dir + "/" + json_file_in; 110 111 if (extracted_feature_record != null) { 112 String volume_id = extracted_feature_record.getString("id"); 113 extracted_feature_record.put("_id",volume_id); 114 extracted_feature_record.remove("id"); 115 116 Document ef_features = (Document)extracted_feature_record.get("features"); 117 118 int ef_page_count = ef_features.getInteger("pageCount"); 119 120 if (_verbosity >= 1) { 121 System.out.println("Processing: " + json_file_in); 122 System.out.println(" pageCount = " + ef_page_count); 123 } 124 125 List<Document> ef_pages = (List<Document>)ef_features.get("pages"); 126 int ef_num_pages = ef_pages.size(); 127 if (ef_num_pages != ef_page_count) { 128 System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")" 129 +" does not match 'pageCount' metadata (" + ef_page_count + ")"); 130 } 131 132 if (_verbosity >= 2) { 133 System.out.print(" Pages: "); 134 } 135 136 for (int i = 0; i < ef_page_count; i++) { 137 String formatted_i = String.format("page-%06d", i); 138 String page_id = volume_id + "." + formatted_i; 139 140 if (_verbosity >= 2) { 141 if (i>0) { 142 System.out.print(", "); 143 } 144 System.out.print(page_id); 145 } 146 147 if (i==(ef_page_count-1)) { 148 if (_verbosity >= 2) { 149 System.out.println(); 150 } 151 } 152 153 Document ef_page = (Document)ef_pages.get(i); 154 155 if (ef_page != null) { 156 157 fixup_page(volume_id, page_id, ef_page); 158 } 159 else { 160 System.err.println("Skipping: " + page_id); 161 } 162 } 163 } 164 else { 165 // File did not exist, or could not be parsed 166 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'"; 167 168 System.err.println("Warning: " + mess); 169 System.out.println("Warning: " + mess); 170 171 } 172 } 173 41 174 public Integer call(String json_file_in) throws IOException 42 175 { … … 53 186 54 187 Document doc = Document.parse(extracted_feature_json_doc); 188 189 fixup_volume(json_file_in,doc); 190 55 191 collection.insertOne(doc); 56 192
Note:
See TracChangeset
for help on using the changeset viewer.