source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@ 30924

Last change on this file since 30924 was 30924, checked in by davidb, 8 years ago

Tidy up of code. Removed commented out code

  • Property svn:executable set to *
File size: 4.0 KB
Line 
1package org.hathitrust;
2
3import java.io.BufferedInputStream;
4import java.io.BufferedReader;
5//import java.io.FileInputStream;
6//import java.io.FileNotFoundException;
7import java.io.IOException;
8import java.io.InputStreamReader;
9//import java.io.UnsupportedEncodingException;
10import java.net.URI;
11//import java.nio.charset.StandardCharsets;
12//import java.nio.file.Files;
13//import java.nio.file.Path;
14//import java.nio.file.Paths;
15import java.util.ArrayList;
16//import java.util.Arrays;
17import java.util.Iterator;
18//import java.util.List;
19
20import org.apache.commons.compress.compressors.CompressorException;
21import org.apache.commons.compress.compressors.CompressorInputStream;
22import org.apache.commons.compress.compressors.CompressorStreamFactory;
23import org.apache.hadoop.conf.Configuration;
24import org.apache.hadoop.fs.FSDataInputStream;
25import org.apache.hadoop.fs.FileSystem;
26import org.apache.hadoop.fs.Path;
27import org.apache.spark.api.java.function.FlatMapFunction;
28import org.json.JSONArray;
29import org.json.JSONObject;
30
31/*
32class PagedJSON implements Function<String, Boolean> {
33
34 private static final long serialVersionUID = 1L;
35
36 public Boolean call(String s) { return s.contains("a"); }
37}
38 */
39
40
41class PagedJSON implements FlatMapFunction<String, String>
42{
43 private static final long serialVersionUID = 1L;
44
45 protected String _input_dir;
46
47 public PagedJSON(String input_dir)
48 {
49 _input_dir = input_dir;
50 }
51
52 protected static BufferedReader getBufferedReaderForCompressedFile(String fileIn)
53 throws CompressorException, IOException
54 {
55 URI uri = URI.create (fileIn);
56 Configuration conf = new Configuration();
57 FileSystem file = FileSystem.get(uri, conf);
58 FSDataInputStream fin = file.open(new Path(uri));
59
60 //FileInputStream fin = new FileInputStream(fileIn);
61 BufferedInputStream bis = new BufferedInputStream(fin);
62 CompressorInputStream input = new CompressorStreamFactory().createCompressorInputStream(bis);
63 BufferedReader br2 = new BufferedReader(new InputStreamReader(input,"UTF8"));
64 return br2;
65 }
66
67 protected JSONObject readJSONFile(String filename)
68 {
69 //Path path = Paths.get(filename);
70
71 StringBuilder sb = new StringBuilder();
72
73 try {
74
75 String str;
76 BufferedReader br = getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
77 while ((str = br.readLine()) != null) {
78 sb.append(str);
79 //System.out.println(str);
80 }
81
82 br.close();
83
84 //System.err.println("*****" + sb.toString());
85
86 /*
87 List<String> lines = Files.readAllLines(path,StandardCharsets.UTF_8);
88
89
90 for (String line : lines) {
91 sb.append(line);
92
93 }
94 */
95
96 }
97 catch (Exception e) {
98 e.printStackTrace();
99 }
100
101 JSONObject json_obj = new JSONObject(sb.toString());
102
103
104 return json_obj;
105
106 //return sb.toString();
107 }
108
109 public Iterator<String> call(String s)
110 {
111 JSONObject extracted_feature_record = readJSONFile(s);
112
113 String id = extracted_feature_record.getString("id");
114
115 JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
116 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
117
118
119 int ef_page_count = ef_features.getInt("pageCount");
120
121 JSONArray ef_pages = ef_features.getJSONArray("pages");
122 int ef_num_pages = ef_pages.length();
123
124 ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
125 for (int i = 0; i < ef_page_count; i++) {
126 ids.add(id + "." + i);
127 }
128
129 /*
130 for (int i = 0; i < ef_num_pages; i++)
131 {
132 //String post_id = ef_pages.getJSONObject(i).getString("post_id");
133 //......
134 }
135 */
136 //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
137/*
138 JSONArray arr = obj.getJSONArray("posts");
139 for (int i = 0; i < arr.length(); i++)
140 {
141 String post_id = arr.getJSONObject(i).getString("post_id");
142 ......
143 }
144*/
145
146
147 ids.add(id);
148
149 return ids.iterator();
150 }
151}
152
Note: See TracBrowser for help on using the repository browser.