source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@ 30937

Last change on this file since 30937 was 30937, checked in by davidb, 8 years ago

Expanded set of ClusterFileIO methods

  • Property svn:executable set to *
File size: 2.7 KB
Line 
1package org.hathitrust;
2
3import java.io.BufferedReader;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import org.apache.spark.api.java.function.FlatMapFunction;
8import org.json.JSONArray;
9import org.json.JSONObject;
10
11/*
12class PagedJSON implements Function<String, Boolean> {
13
14 private static final long serialVersionUID = 1L;
15
16 public Boolean call(String s) { return s.contains("a"); }
17}
18 */
19
20
21class PagedJSON implements FlatMapFunction<String, String>
22{
23 private static final long serialVersionUID = 1L;
24
25 protected String _input_dir;
26
27 public PagedJSON(String input_dir)
28 {
29 _input_dir = input_dir;
30 }
31
32 protected JSONObject readJSONFile(String filename)
33 {
34 //Path path = Paths.get(filename);
35
36 StringBuilder sb = new StringBuilder();
37
38 try {
39
40 String str;
41 BufferedReader br = ClusterFileIO.getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
42 while ((str = br.readLine()) != null) {
43 sb.append(str);
44 //System.out.println(str);
45 }
46
47 br.close();
48
49 //System.err.println("*****" + sb.toString());
50
51 /*
52 List<String> lines = Files.readAllLines(path,StandardCharsets.UTF_8);
53
54
55 for (String line : lines) {
56 sb.append(line);
57
58 }
59 */
60
61 }
62 catch (Exception e) {
63 e.printStackTrace();
64 }
65
66 JSONObject json_obj = new JSONObject(sb.toString());
67
68
69 return json_obj;
70
71 //return sb.toString();
72 }
73
74 public Iterator<String> call(String json_file_in)
75 {
76 JSONObject extracted_feature_record = readJSONFile(json_file_in);
77
78 // Check output directory for volume exists, and create it if not
79
80
81 String id = extracted_feature_record.getString("id");
82
83 JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
84 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
85
86
87 int ef_page_count = ef_features.getInt("pageCount");
88
89 JSONArray ef_pages = ef_features.getJSONArray("pages");
90 int ef_num_pages = ef_pages.length();
91
92 ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
93 for (int i = 0; i < ef_page_count; i++) {
94 ids.add(id + "." + i);
95 }
96
97 /*
98 for (int i = 0; i < ef_num_pages; i++)
99 {
100 //String post_id = ef_pages.getJSONObject(i).getString("post_id");
101 //......
102 }
103 */
104 //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
105/*
106 JSONArray arr = obj.getJSONArray("posts");
107 for (int i = 0; i < arr.length(); i++)
108 {
109 String post_id = arr.getJSONObject(i).getString("post_id");
110 ......
111 }
112*/
113
114
115 ids.add(id);
116
117 return ids.iterator();
118 }
119}
120
Note: See TracBrowser for help on using the repository browser.