source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java@ 30944

Last change on this file since 30944 was 30944, checked in by davidb, 7 years ago

Force a higher partition count (6) than the default, which seems to be 2

  • Property svn:executable set to *
File size: 4.2 KB
Line 
1package org.hathitrust;
2
3import java.io.IOException;
4import java.io.Serializable;
5import java.nio.charset.StandardCharsets;
6import java.nio.file.Files;
7import java.nio.file.Path;
8import java.nio.file.Paths;
9import java.util.List;
10
11import org.apache.commons.cli.*;
12
13import org.apache.spark.api.java.*;
14import org.apache.spark.SparkConf;
15import org.apache.spark.api.java.function.Function;
16
17public class PrepareForIngest implements Serializable
18{
19 private static final long serialVersionUID = 1L;
20
21 protected String _input_dir;
22 protected String _json_list_filename;
23 protected String _output_dir;
24
25 protected int _verbosity;
26
27 public PrepareForIngest(String input_dir, String json_list_filename, String output_dir, int verbosity)
28 {
29 _input_dir = input_dir;
30 _json_list_filename = (json_list_filename != null) ? json_list_filename : input_dir;
31 _output_dir = output_dir;
32 _verbosity = verbosity;
33 }
34
35 public void exec()
36 {
37 String spark_app_name = "HathiTrust Extract Features: Prepare for Solr Ingest";
38 spark_app_name += " [" + _json_list_filename + "]";
39
40 SparkConf conf = new SparkConf().setAppName(spark_app_name);
41 JavaSparkContext jsc = new JavaSparkContext(conf);
42 //ClusterFileIO.init(_input_dir);
43
44 // Check output directory exists, and create it if not
45
46
47 if (_verbosity >= 1) {
48 System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
49 System.out.println("Default Parallelism: " + jsc.defaultParallelism());
50 }
51
52 JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,6).cache();
53
54 JavaRDD<String> json_ids = json_list_data.flatMap(new PagedJSON(_input_dir,_verbosity));
55
56
57 //long numAs = json_list_data.filter(new ContainsA()).count();
58
59
60 /*
61 long numBs = json_list_data.filter(new Function<String, Boolean>() {
62 public Boolean call(String s) { return s.contains("b"); }
63 }).count();
64
65 System.out.println("#### Lines with a: " + numAs + ", lines with b: " + numBs);
66 */
67 long num_ids = json_ids.count();
68 System.out.println("");
69 System.out.println("############");
70 System.out.println("# Number of page ids: " + num_ids);
71 System.out.println("############");
72 System.out.println("");
73
74 jsc.close();
75 }
76
77 public static void main(String[] args) {
78
79
80 Options options = new Options();
81
82 //.withType(Integer.class)
83
84 options.addOption(OptionBuilder.withLongOpt("verbosity")
85 .withDescription("Set to control the level of debugging output [0=none, 1=some, 2=lots]")
86 .hasArg()
87 .withArgName("v")
88 .isRequired(false)
89 .create());
90
91 //Option num_cores_opt = new Option("n", "num-cores", true, "Number of cores to use");
92 //num_cores_opt.setRequired(false);
93 //options.addOption(num_cores_opt);
94
95 //CommandLineParser parser = new DefaultParser(); // 1.3 and above
96 CommandLineParser parser = new GnuParser();
97 HelpFormatter formatter = new HelpFormatter();
98 CommandLine cmd;
99
100 try {
101 cmd = parser.parse(options, args);
102 }
103 catch (ParseException e) {
104 System.err.println(e.getMessage());
105 formatter.printHelp("RUN.bash [options] json-file-list.txt input-dir output-dir", options);
106 System.exit(1);
107 return;
108 }
109
110 //value = ((Integer)cmdLine.getParsedOptionValue("num-cores")).intValue();
111 //value = ((Integer)cmdLine.getOptionValue("num-cores","2")).intValue();
112
113 //cmd.hasOption("json-filelist")
114 String verbosity_str = cmd.getOptionValue("verbosity","0");
115 int verbosity = Integer.parseInt(verbosity_str);
116
117 //System.out.println(inputFilePath);
118 //System.out.println(outputFilePath);
119
120
121 String[] filtered_args = cmd.getArgs();
122
123 if (filtered_args.length != 3) {
124 formatter.printHelp("RUN.bash [options] json-filelist.txt input-dir output-dir", options);
125 System.exit(1);
126 }
127 String json_list_filename = filtered_args[0];
128 String input_dir = filtered_args[1];
129 String output_dir = filtered_args[2];
130
131
132 //String json_list_filename = cmd.getArgs()[0]; // args[0];
133 //String json_list_filename = args[0];
134 //int num_cores = 2;
135
136 PrepareForIngest prep_for_ingest = new PrepareForIngest(input_dir,json_list_filename,output_dir,verbosity);
137 prep_for_ingest.exec();
138
139 }
140}
Note: See TracBrowser for help on using the repository browser.