Changeset 30937 for other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java
- Timestamp:
- 2016-10-26T13:44:38+13:00 (8 years ago)
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java
r30934 r30937 19 19 private static final long serialVersionUID = 1L; 20 20 21 protected String _input_dir; 22 protected String _json_list_filename; 23 protected String _output_dir; 24 protected int _verbosity; 25 21 protected String _input_dir; 22 protected String _json_list_filename; 23 protected String _output_dir; 24 25 protected int _verbosity; 26 26 27 public PrepareForIngest(String input_dir, String json_list_filename, String output_dir, int verbosity) 27 28 { … … 34 35 public void exec() 35 36 { 36 S parkConf conf = new SparkConf().setAppName("HTRC-EF: Prepare for Solr Ingest");37 JavaSparkContext sc = new JavaSparkContext(conf);37 String spark_app_name = "HathiTrust Extract Features: Prepare for Solr Ingest"; 38 spark_app_name += "[" + _json_list_filename + "]"; 38 39 39 JavaRDD<String> json_list_data = sc.textFile(_json_list_filename).cache(); 40 SparkConf conf = new SparkConf().setAppName(spark_app_name); 41 JavaSparkContext jsc = new JavaSparkContext(conf); 42 ClusterFileIO.init(_input_dir); 43 44 // Check output directory exists, and create it if not 45 46 47 JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache(); 40 48 41 49 JavaRDD<String> json_ids = json_list_data.flatMap(new PagedJSON(_input_dir)); 42 50 43 51 44 52 //long numAs = json_list_data.filter(new ContainsA()).count(); 45 53 46 47 /*54 55 /* 48 56 long numBs = json_list_data.filter(new Function<String, Boolean>() { 49 57 public Boolean call(String s) { return s.contains("b"); } … … 51 59 52 60 System.out.println("#### Lines with a: " + numAs + ", lines with b: " + numBs); 53 */61 */ 54 62 long num_ids = json_ids.count(); 55 63 System.out.println(""); … … 58 66 System.out.println("############"); 59 67 System.out.println(""); 60 61 sc.close();68 69 jsc.close(); 62 70 } 63 71 … … 90 98 catch (ParseException e) { 91 99 System.err.println(e.getMessage()); 92 //System.err.println("Usage: RUN.bat [options] json-file-list.txt input-dir output-dir"); 93 
formatter.printHelp("RUN.bash/RUN.bat [options] json-file-list.txt input-dir output-dir", options); 94 //System.err.println(" Where 'filename.txt' contains a list of JSON files, one per line,"); 95 //System.err.println(" which use the HathiTrust Extracted Feature JSON format"); 96 100 formatter.printHelp("RUN.bash [options] json-file-list.txt input-dir output-dir", options); 97 101 System.exit(1); 98 102 return; … … 110 114 111 115 116 String[] filtered_args = cmd.getArgs(); 112 117 113 String[] filtered_args = cmd.getArgs();114 115 118 if (filtered_args.length != 3) { 116 //System.err.println("Usage: RUN.bat [options] json-filelist.txt input-dir output-dir"); 117 formatter.printHelp("RUN.bash/RUN.bat [options] json-filelist.txt input-dir output-dir", options); 118 119 //System.err.println("Usage: RUN.bat [options] input-dir output-dir"); 120 //System.err.println(" Where 'filename.txt' contains a list of JSON files, one per line,"); 121 //System.err.println(" which use the HathiTrust Extracted Feature JSON format"); 122 System.exit(1); 119 formatter.printHelp("RUN.bash [options] json-filelist.txt input-dir output-dir", options); 120 System.exit(1); 123 121 } 124 122 String json_list_filename = filtered_args[0]; 125 123 String input_dir = filtered_args[1]; 126 124 String output_dir = filtered_args[2]; 127 125 128 126 129 127 //String json_list_filename = cmd.getArgs()[0]; // args[0];
Note: See TracChangeset for help on using the changeset viewer.