Changeset 31269 for other-projects/hathitrust
- Timestamp:
- 2016-12-28T10:30:08+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
--- PerVolumeJSON.java (r31266)
+++ PerVolumeJSON.java (r31269)
@@ -113,8 +113,9 @@
     ids.add(page_id);
 
-    if (i==0) {
-        System.out.println("Sample output JSON page file: " + output_json_bz2);
+    if (_verbosity >=2) {
+        if (i==0) {
+            System.out.println("Sample output JSON page file [i=0]: " + output_json_bz2);
+        }
     }
-
     JSONObject ef_page = ef_pages.getJSONObject(i);
 
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeLangStreamFlatmap.java
--- PerVolumeLangStreamFlatmap.java (r31260)
+++ PerVolumeLangStreamFlatmap.java (r31269)
@@ -41,5 +41,5 @@
     JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
 
-    ArrayList<String> all_word_list = new ArrayList<String>();
+    ArrayList<String> all_lang_list = new ArrayList<String>();
 
     if (extracted_feature_record != null) {
@@ -87,6 +87,6 @@
     if (ef_page != null) {
 
-        ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountLangLabels(volume_id, page_id, ef_page);
-        all_word_list.addAll(page_word_list);
+        ArrayList<String> page_lang_labels = SolrDocJSON.generateTokenPosCountLangLabels(volume_id, page_id, ef_page);
+        all_lang_list.addAll(page_lang_labels);
     }
     else {
@@ -109,5 +109,5 @@
     _progress_accum.add(_progress_step);
 
-    return all_word_list.iterator();
+    return all_lang_list.iterator();
 }
 
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
--- ProcessForSolrIngest.java (r31266)
+++ ProcessForSolrIngest.java (r31269)
@@ -113,12 +113,4 @@
     DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
 
-    System.err.println();
-    System.err.println();
-    System.err.println();
-    System.err.println("****##### _input_dir = " + _input_dir);
-    System.err.println();
-    System.err.println();
-    System.err.println();
-
     boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
     boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
@@ -130,7 +122,7 @@
     //json_list_data_rp.foreach(per_vol_json);
     JavaRDD<String> per_page_ids = json_list_data_rp.flatMap(per_vol_json);
-    long num_page_ids = per_page_ids.count();
-
-    long num_ids = num_volumes;
+    long num_page_ids = per_page_ids.count(); // trigger lazy eval of: flatmap:per-vol
+
+    //long num_ids = num_volumes;
 
     System.out.println("");
Note:
See TracChangeset
for help on using the changeset viewer.