source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java@ 30934

Last change on this file since 30934 was 30934, checked in by davidb, 7 years ago

Providing json-filelist now a compulsory argument, rather than an option

  • Property svn:executable set to *
File size: 4.4 KB
Line 
1package org.hathitrust;
2
3import java.io.IOException;
4import java.io.Serializable;
5import java.nio.charset.StandardCharsets;
6import java.nio.file.Files;
7import java.nio.file.Path;
8import java.nio.file.Paths;
9import java.util.List;
10
11import org.apache.commons.cli.*;
12
13import org.apache.spark.api.java.*;
14import org.apache.spark.SparkConf;
15import org.apache.spark.api.java.function.Function;
16
17public class PrepareForIngest implements Serializable
18{
19 private static final long serialVersionUID = 1L;
20
21 protected String _input_dir;
22 protected String _json_list_filename;
23 protected String _output_dir;
24 protected int _verbosity;
25
26 public PrepareForIngest(String input_dir, String json_list_filename, String output_dir, int verbosity)
27 {
28 _input_dir = input_dir;
29 _json_list_filename = (json_list_filename != null) ? json_list_filename : input_dir;
30 _output_dir = output_dir;
31 _verbosity = verbosity;
32 }
33
34 public void exec()
35 {
36 SparkConf conf = new SparkConf().setAppName("HTRC-EF: Prepare for Solr Ingest");
37 JavaSparkContext sc = new JavaSparkContext(conf);
38
39 JavaRDD<String> json_list_data = sc.textFile(_json_list_filename).cache();
40
41 JavaRDD<String> json_ids = json_list_data.flatMap(new PagedJSON(_input_dir));
42
43
44 //long numAs = json_list_data.filter(new ContainsA()).count();
45
46
47/*
48 long numBs = json_list_data.filter(new Function<String, Boolean>() {
49 public Boolean call(String s) { return s.contains("b"); }
50 }).count();
51
52 System.out.println("#### Lines with a: " + numAs + ", lines with b: " + numBs);
53 */
54 long num_ids = json_ids.count();
55 System.out.println("");
56 System.out.println("############");
57 System.out.println("# number of IDS: " + num_ids);
58 System.out.println("############");
59 System.out.println("");
60
61 sc.close();
62 }
63
64 public static void main(String[] args) {
65
66
67 Options options = new Options();
68
69 //.withType(Integer.class)
70
71 options.addOption(OptionBuilder.withLongOpt("verbosity")
72 .withDescription("Set to control the level of debugging output [0=none, 1=some, 2=lots]")
73 .hasArg()
74 .withArgName("v")
75 .isRequired(false)
76 .create());
77
78 //Option num_cores_opt = new Option("n", "num-cores", true, "Number of cores to use");
79 //num_cores_opt.setRequired(false);
80 //options.addOption(num_cores_opt);
81
82 //CommandLineParser parser = new DefaultParser(); // 1.3 and above
83 CommandLineParser parser = new GnuParser();
84 HelpFormatter formatter = new HelpFormatter();
85 CommandLine cmd;
86
87 try {
88 cmd = parser.parse(options, args);
89 }
90 catch (ParseException e) {
91 System.err.println(e.getMessage());
92 //System.err.println("Usage: RUN.bat [options] json-file-list.txt input-dir output-dir");
93 formatter.printHelp("RUN.bash/RUN.bat [options] json-file-list.txt input-dir output-dir", options);
94 //System.err.println(" Where 'filename.txt' contains a list of JSON files, one per line,");
95 //System.err.println(" which use the HathiTrust Extracted Feature JSON format");
96
97 System.exit(1);
98 return;
99 }
100
101 //value = ((Integer)cmdLine.getParsedOptionValue("num-cores")).intValue();
102 //value = ((Integer)cmdLine.getOptionValue("num-cores","2")).intValue();
103
104 //cmd.hasOption("json-filelist")
105 String verbosity_str = cmd.getOptionValue("verbosity","0");
106 int verbosity = Integer.parseInt(verbosity_str);
107
108 //System.out.println(inputFilePath);
109 //System.out.println(outputFilePath);
110
111
112
113 String[] filtered_args = cmd.getArgs();
114
115 if (filtered_args.length != 3) {
116 //System.err.println("Usage: RUN.bat [options] json-filelist.txt input-dir output-dir");
117 formatter.printHelp("RUN.bash/RUN.bat [options] json-filelist.txt input-dir output-dir", options);
118
119 //System.err.println("Usage: RUN.bat [options] input-dir output-dir");
120 //System.err.println(" Where 'filename.txt' contains a list of JSON files, one per line,");
121 //System.err.println(" which use the HathiTrust Extracted Feature JSON format");
122 System.exit(1);
123 }
124 String json_list_filename = filtered_args[0];
125 String input_dir = filtered_args[1];
126 String output_dir = filtered_args[2];
127
128
129 //String json_list_filename = cmd.getArgs()[0]; // args[0];
130 //String json_list_filename = args[0];
131 //int num_cores = 2;
132
133 PrepareForIngest prep_for_ingest = new PrepareForIngest(input_dir,json_list_filename,output_dir,verbosity);
134 prep_for_ingest.exec();
135
136 }
137}
Note: See TracBrowser for help on using the repository browser.