source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java@ 30918

Last change on this file since 30918 was 30918, checked in by davidb, 7 years ago

More flexible command-line args

  • Property svn:executable set to *
File size: 4.4 KB
Line 
1package org.hathitrust;
2
3import java.io.IOException;
4import java.io.Serializable;
5import java.nio.charset.StandardCharsets;
6import java.nio.file.Files;
7import java.nio.file.Path;
8import java.nio.file.Paths;
9import java.util.List;
10
11import org.apache.commons.cli.*;
12
13import org.apache.spark.api.java.*;
14import org.apache.spark.SparkConf;
15import org.apache.spark.api.java.function.Function;
16
17public class PrepareForIngest implements Serializable
18{
19 private static final long serialVersionUID = 1L;
20
21/*
22 class ContainsA implements Function<String, Boolean> {
23
24 private static final long serialVersionUID = 1L;
25
26 public Boolean call(String s) { return s.contains("a"); }
27 }
28
29 class ConvertJSON implements Function<String, Boolean> {
30
31 private static final long serialVersionUID = 1L;
32
33 public Boolean call(String s) { return s.contains("a"); }
34 }
35*/
36
37 //protected int _num_cores;
38 protected String _input_dir;
39 protected String _json_list_filename;
40 protected String _output_dir;
41
42 public PrepareForIngest(String input_dir, String json_list_filename, String output_dir)
43 {
44 //_num_cores = num_cores;
45 _input_dir = input_dir;
46 _json_list_filename = (json_list_filename != null) ? json_list_filename : input_dir;
47 _output_dir = output_dir;
48 }
49
50 public void exec()
51 {
52 SparkConf conf = new SparkConf().setAppName("HTRC-EF: Prepare for Solr Ingest");
53 JavaSparkContext sc = new JavaSparkContext(conf);
54
55 JavaRDD<String> json_list_data = sc.textFile(_json_list_filename).cache();
56
57 JavaRDD<String> json_ids = json_list_data.flatMap(new PagedJSON(_input_dir));
58
59
60 //long numAs = json_list_data.filter(new ContainsA()).count();
61
62
63/*
64 long numBs = json_list_data.filter(new Function<String, Boolean>() {
65 public Boolean call(String s) { return s.contains("b"); }
66 }).count();
67
68 System.out.println("####**** Lines with a: " + numAs + ", lines with b: " + numBs);
69 */
70 long num_ids = json_ids.count();
71 System.out.println("####**** number of IDS: " + num_ids);
72
73 sc.close();
74 }
75
76 public static void main(String[] args) {
77
78
79 Options options = new Options();
80
81 //.withType(Integer.class)
82
83 options.addOption(OptionBuilder.withLongOpt("json-filelist")
84 .withDescription("Explicit list of JSON files to read in")
85 .hasArg()
86 .withArgName("f")
87 .isRequired(false)
88 .create());
89
90 //Option num_cores_opt = new Option("n", "num-cores", true, "Number of cores to use");
91 //num_cores_opt.setRequired(false);
92 //options.addOption(num_cores_opt);
93
94 //CommandLineParser parser = new DefaultParser(); // 1.3 and above
95 CommandLineParser parser = new GnuParser();
96 HelpFormatter formatter = new HelpFormatter();
97 CommandLine cmd;
98
99 try {
100 cmd = parser.parse(options, args);
101 }
102 catch (ParseException e) {
103 System.err.println(e.getMessage());
104 System.err.println("Usage: RUN.bat [options] input-dir output-dir");
105 formatter.printHelp("utility-name", options);
106 //System.err.println(" Where 'filename.txt' contains a list of JSON files, one per line,");
107 //System.err.println(" which use the HathiTrust Extracted Feature JSON format");
108
109 System.exit(1);
110 return;
111 }
112
113 //value = ((Integer)cmdLine.getParsedOptionValue("num-cores")).intValue();
114 //value = ((Integer)cmdLine.getOptionValue("num-cores","2")).intValue();
115
116 //cmd.hasOption("json-filelist")
117 String json_list_filename = cmd.getOptionValue("json-filelist");
118 //int num_cores = Integer.parseInt(num_cores_str);
119
120 //System.out.println(inputFilePath);
121 //System.out.println(outputFilePath);
122
123
124
125 String[] filtered_args = cmd.getArgs();
126
127 if (filtered_args.length != 2) {
128 System.err.println("Usage: RUN.bat [options] input-dir output-dir");
129 formatter.printHelp("utility-name", options);
130
131 //System.err.println("Usage: RUN.bat [options] input-dir output-dir");
132 //System.err.println(" Where 'filename.txt' contains a list of JSON files, one per line,");
133 //System.err.println(" which use the HathiTrust Extracted Feature JSON format");
134 System.exit(1);
135 }
136 String input_dir = filtered_args[0];
137 String output_dir = filtered_args[1];
138
139
140 //String json_list_filename = cmd.getArgs()[0]; // args[0];
141 //String json_list_filename = args[0];
142 //int num_cores = 2;
143
144 PrepareForIngest prep_for_ingest = new PrepareForIngest(input_dir,json_list_filename,output_dir);
145 prep_for_ingest.exec();
146
147 }
148}
Note: See TracBrowser for help on using the repository browser.