Changeset 31260 for other-projects/hathitrust/wcsa
- Timestamp:
- 2016-12-21T00:12:10+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
- Files:
-
- 3 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForPOSCount.java
r31259 r31260 45 45 protected String generateSparkAppName(String exec_mode) 46 46 { 47 String spark_app_name = "[" + exec_mode + "] Extracted Features: Process for POS ";47 String spark_app_name = "[" + exec_mode + "] Extracted Features: Process for POS Count"; 48 48 spark_app_name += " [" + _json_list_filename + "]"; 49 49 … … 79 79 //boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize"); 80 80 81 PerVolumePOSStreamFlatmap paged_solr_ wordfreq_flatmap81 PerVolumePOSStreamFlatmap paged_solr_posfreq_flatmap 82 82 = new PerVolumePOSStreamFlatmap(_input_dir,_verbosity, 83 83 per_vol_progress_accum,per_vol, 84 84 strict_file_io); 85 JavaRDD<String> pos_list = json_list_data.flatMap(paged_solr_ wordfreq_flatmap);85 JavaRDD<String> pos_list = json_list_data.flatMap(paged_solr_posfreq_flatmap); 86 86 pos_list.setName("pos-stream"); 87 87 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31258 r31260 13 13 import java.util.Iterator; 14 14 import org.apache.commons.compress.compressors.CompressorException; 15 import org.json.JSONArray; 15 16 import org.json.JSONObject; 16 17 import org.apache.lucene.analysis.TokenStream; … … 294 295 } 295 296 297 public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page) 298 { 299 ArrayList<String> lang_list = new ArrayList<String>();; 300 301 if (ef_page != null) { 302 JSONArray ef_languages = ef_page.getJSONArray("languages"); 303 if (ef_languages != null) { 304 305 int lang_len = ef_languages.length(); 306 for (int i=0; i<lang_len; i++) { 307 JSONObject lang_rec = ef_languages.getJSONObject(i); 308 309 Iterator<String> lang_key_iter = lang_rec.keys(); 310 while (lang_key_iter.hasNext()) { 311 String lang_label = lang_key_iter.next(); 312 313 lang_list.add(lang_label); 314 } 315 } 316 } 317 else { 318 System.err.println("Warning: empty languages field for '" + page_id + "'"); 319 } 320 321 } 322 else { 323 System.err.println("Warning: null page for '" + page_id + "'"); 324 } 325 326 return lang_list; 327 } 328 296 329 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2) 297 330 {
Note:
See TracChangeset
for help on using the changeset viewer.