Changeset 30951
- Timestamp: 2016-10-26T17:54:44+13:00 (6 years ago)
- Location: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java
r30949 r30951 2 2 3 3 import java.io.BufferedReader; 4 import java.io.BufferedWriter; 5 import java.io.IOException; 4 6 import java.util.ArrayList; 5 7 import java.util.Iterator; 6 8 9 import org.apache.commons.compress.compressors.CompressorException; 7 10 import org.apache.spark.api.java.function.FlatMapFunction; 8 11 import org.json.JSONArray; … … 85 88 String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2"); 86 89 String page_json_dir = json_dir + "/pages"; 87 //ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);88 System.out.println("mkdir: " + _output_dir + "/" + page_json_dir);90 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir); 91 //System.out.println("mkdir: " + _output_dir + "/" + page_json_dir); 89 92 90 93 ArrayList<String> ids = new ArrayList<String>(ef_num_pages); … … 97 100 } 98 101 99 // create JSON obj of just the page (for now) 100 // write it out 102 101 103 102 104 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2"; 103 104 105 ids.add(output_json_bz2); 105 106 … … 107 108 System.out.println("Sample output JSON page file: " + output_json_bz2); 108 109 } 110 111 // create JSON obj of just the page (for now), and write it out 112 JSONObject ef_page = ef_pages.getJSONObject(i); 113 try { 114 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_json_bz2); 115 bw.write(ef_page.toString()); 116 bw.close(); 117 } catch (IOException e) { 118 e.printStackTrace(); 119 } catch (CompressorException e) { 120 e.printStackTrace(); 121 } 122 123 109 124 } 110 125 -
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java
r30949 r30951 45 45 JavaRDD<String> json_ids = json_list_data.flatMap(paged_json).cache(); 46 46 47 json_ids.saveAsTextFile("rdd-solr-json-page-files");48 49 47 long num_ids = json_ids.count(); 50 48 System.out.println(""); … … 54 52 System.out.println(""); 55 53 54 System.out.println("############"); 55 String rdd_save_file = "rdd-solr-json-page-files"; 56 json_ids.saveAsTextFile(rdd_save_file); 57 System.out.println("# Saved RDD of Solr JSON page files, top-level, as:"); 58 System.out.println("# " + rdd_save_file); 59 System.out.println("############"); 60 System.out.println(""); 61 56 62 jsc.close(); 57 63 }
Note: See TracChangeset for help on using the changeset viewer.
for help on using the changeset viewer.