source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@ 30921

Last change on this file since 30921 was 30921, checked in by davidb, 7 years ago

Code change to read in JSON file over HDFS

  • Property svn:executable set to *
File size: 4.2 KB
Line 
1package org.hathitrust;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedReader;
5import java.io.FileInputStream;
6import java.io.FileNotFoundException;
7import java.io.IOException;
8import java.io.InputStreamReader;
9import java.io.UnsupportedEncodingException;
10import java.net.URI;
11import java.nio.charset.StandardCharsets;
12//import java.nio.file.Files;
13//import java.nio.file.Path;
14//import java.nio.file.Paths;
15import java.util.ArrayList;
16import java.util.Arrays;
17import java.util.Iterator;
18import java.util.List;
19
20import org.apache.commons.compress.compressors.CompressorException;
21import org.apache.commons.compress.compressors.CompressorInputStream;
22import org.apache.commons.compress.compressors.CompressorStreamFactory;
23import org.apache.hadoop.conf.Configuration;
24import org.apache.hadoop.fs.FSDataInputStream;
25import org.apache.hadoop.fs.FileSystem;
26import org.apache.hadoop.fs.Path;
27import org.apache.spark.api.java.function.FlatMapFunction;
28import org.json.JSONArray;
29import org.json.JSONObject;
30
31/*
32class PagedJSON implements Function<String, Boolean> {
33
34 private static final long serialVersionUID = 1L;
35
36 public Boolean call(String s) { return s.contains("a"); }
37}
38 */
39
40
41/*
42URI uri = URI.create ("hdfs://host:port/file path");
43Configuration conf = new Configuration();
44FileSystem file = FileSystem.get(uri, conf);
45FSDataInputStream in = file.open(new Path(uri));
46
47*/
48
49class PagedJSON implements FlatMapFunction<String, String>
50{
51 private static final long serialVersionUID = 1L;
52
53 protected String _input_dir;
54
55 public PagedJSON(String input_dir)
56 {
57 _input_dir = input_dir;
58 }
59
60 protected static BufferedReader getBufferedReaderForCompressedFile(String fileIn)
61 throws CompressorException, IOException
62 {
63 URI uri = URI.create (fileIn);
64 Configuration conf = new Configuration();
65 FileSystem file = FileSystem.get(uri, conf);
66 FSDataInputStream fin = file.open(new Path(uri));
67
68 //FileInputStream fin = new FileInputStream(fileIn);
69 BufferedInputStream bis = new BufferedInputStream(fin);
70 CompressorInputStream input = new CompressorStreamFactory().createCompressorInputStream(bis);
71 BufferedReader br2 = new BufferedReader(new InputStreamReader(input,"UTF8"));
72 return br2;
73 }
74
75 protected JSONObject readJSONFile(String filename)
76 {
77 //Path path = Paths.get(filename);
78
79 StringBuilder sb = new StringBuilder();
80
81 try {
82
83 String str;
84 BufferedReader br = getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
85 while ((str = br.readLine()) != null) {
86 sb.append(str);
87 //System.out.println(str);
88 }
89
90 br.close();
91
92 //System.err.println("*****" + sb.toString());
93
94 /*
95 List<String> lines = Files.readAllLines(path,StandardCharsets.UTF_8);
96
97
98 for (String line : lines) {
99 sb.append(line);
100
101 }
102 */
103
104 }
105 catch (Exception e) {
106 e.printStackTrace();
107 }
108
109 JSONObject json_obj = new JSONObject(sb.toString());
110
111
112 return json_obj;
113
114 //return sb.toString();
115 }
116
117 public Iterator<String> call(String s)
118 {
119 JSONObject extracted_feature_record = readJSONFile(s);
120
121 String id = extracted_feature_record.getString("id");
122
123 JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
124 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
125
126
127 int ef_page_count = ef_features.getInt("pageCount");
128
129 JSONArray ef_pages = ef_features.getJSONArray("pages");
130 int ef_num_pages = ef_pages.length();
131
132 ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
133 for (int i = 0; i < ef_page_count; i++) {
134 ids.add(id + "." + i);
135 }
136
137 /*
138 for (int i = 0; i < ef_num_pages; i++)
139 {
140 //String post_id = ef_pages.getJSONObject(i).getString("post_id");
141 //......
142 }
143 */
144 //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
145/*
146 JSONArray arr = obj.getJSONArray("posts");
147 for (int i = 0; i < arr.length(); i++)
148 {
149 String post_id = arr.getJSONObject(i).getString("post_id");
150 ......
151 }
152*/
153
154
155 ids.add(id);
156
157 return ids.iterator();
158 }
159}
160
Note: See TracBrowser for help on using the repository browser.