source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@ 30918

Last change on this file since 30918 was 30918, checked in by davidb, 8 years ago

More flexible command-line args

  • Property svn:executable set to *
File size: 3.8 KB
Line 
1package org.hathitrust;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedReader;
5import java.io.FileInputStream;
6import java.io.FileNotFoundException;
7import java.io.IOException;
8import java.io.InputStreamReader;
9import java.io.UnsupportedEncodingException;
10import java.nio.charset.StandardCharsets;
11import java.nio.file.Files;
12import java.nio.file.Path;
13import java.nio.file.Paths;
14import java.util.ArrayList;
15import java.util.Arrays;
16import java.util.Iterator;
17import java.util.List;
18
19import org.apache.commons.compress.compressors.CompressorException;
20import org.apache.commons.compress.compressors.CompressorInputStream;
21import org.apache.commons.compress.compressors.CompressorStreamFactory;
22import org.apache.spark.api.java.function.FlatMapFunction;
23import org.json.JSONArray;
24import org.json.JSONObject;
25
26/*
27class PagedJSON implements Function<String, Boolean> {
28
29 private static final long serialVersionUID = 1L;
30
31 public Boolean call(String s) { return s.contains("a"); }
32}
33 */
34
35
36/*
37URI uri = URI.create("hdfs://host:port/file path");
38Configuration conf = new Configuration();
39FileSystem file = FileSystem.get(uri, conf);
40FSDataInputStream in = file.open(new Path(uri));
41
42*/
43
44class PagedJSON implements FlatMapFunction<String, String>
45{
46 private static final long serialVersionUID = 1L;
47
48 protected String _input_dir;
49
50 public PagedJSON(String input_dir)
51 {
52 _input_dir = input_dir;
53 }
54
55 protected static BufferedReader getBufferedReaderForCompressedFile(String fileIn)
56 throws FileNotFoundException, UnsupportedEncodingException, CompressorException {
57 FileInputStream fin = new FileInputStream(fileIn);
58 BufferedInputStream bis = new BufferedInputStream(fin);
59 CompressorInputStream input = new CompressorStreamFactory().createCompressorInputStream(bis);
60 BufferedReader br2 = new BufferedReader(new InputStreamReader(input,"UTF8"));
61 return br2;
62 }
63
64 protected JSONObject readJSONFile(String filename)
65 {
66 //Path path = Paths.get(filename);
67
68 StringBuilder sb = new StringBuilder();
69
70 try {
71
72 String str;
73 BufferedReader br = getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
74 while ((str = br.readLine()) != null) {
75 sb.append(str);
76 //System.out.println(str);
77 }
78
79 br.close();
80
81 //System.err.println("*****" + sb.toString());
82
83 /*
84 List<String> lines = Files.readAllLines(path,StandardCharsets.UTF_8);
85
86
87 for (String line : lines) {
88 sb.append(line);
89
90 }
91 */
92
93 }
94 catch (Exception e) {
95 e.printStackTrace();
96 }
97
98 JSONObject json_obj = new JSONObject(sb.toString());
99
100
101 return json_obj;
102
103 //return sb.toString();
104 }
105
106 public Iterator<String> call(String s)
107 {
108 JSONObject extracted_feature_record = readJSONFile(s);
109
110 String id = extracted_feature_record.getString("id");
111
112 JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
113 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
114
115
116 int ef_page_count = ef_features.getInt("pageCount");
117
118 JSONArray ef_pages = ef_features.getJSONArray("pages");
119 int ef_num_pages = ef_pages.length();
120
121 ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
122 for (int i = 0; i < ef_page_count; i++) {
123 ids.add(id + "." + i);
124 }
125
126 /*
127 for (int i = 0; i < ef_num_pages; i++)
128 {
129 //String post_id = ef_pages.getJSONObject(i).getString("post_id");
130 //......
131 }
132 */
133 //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
134/*
135 JSONArray arr = obj.getJSONArray("posts");
136 for (int i = 0; i < arr.length(); i++)
137 {
138 String post_id = arr.getJSONObject(i).getString("post_id");
139 ......
140 }
141*/
142
143
144 ids.add(id);
145
146 return ids.iterator();
147 }
148}
149
Note: See TracBrowser for help on using the repository browser.