source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumePOSStreamFlatmap.java@ 32101

Last change on this file since 32101 was 31271, checked in by davidb, 7 years ago

Updating of POS code to new files-per-partition parameter, plus some other related tweaks

  • Property svn:executable set to *
File size: 3.2 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import org.apache.spark.api.java.function.FlatMapFunction;
8import org.apache.spark.util.DoubleAccumulator;
9import org.json.JSONArray;
10import org.json.JSONObject;
11
12class PerVolumePOSStreamFlatmap implements FlatMapFunction<String, String>
13{
14 private static final long serialVersionUID = 1L;
15
16 protected String _input_dir;
17 protected int _verbosity;
18
19 protected DoubleAccumulator _progress_accum;
20 protected double _progress_step;
21
22 boolean _strict_file_io;
23
24 public PerVolumePOSStreamFlatmap(String input_dir, int verbosity,
25 DoubleAccumulator progress_accum, double progress_step,
26 boolean strict_file_io)
27 {
28 _input_dir = input_dir;
29 _verbosity = verbosity;
30
31 _progress_accum = progress_accum;
32 _progress_step = progress_step;
33
34 _strict_file_io = strict_file_io;
35 }
36
37 public Iterator<String> call(String json_file_in) throws IOException
38 {
39
40 String full_json_file_in = _input_dir + "/" + json_file_in;
41 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
42
43 ArrayList<String> all_pos_list = new ArrayList<String>();
44
45 if (extracted_feature_record != null) {
46 String volume_id = extracted_feature_record.getString("id");
47
48 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
49
50 int ef_page_count = ef_features.getInt("pageCount");
51
52 if (_verbosity >= 1) {
53 System.out.println("Processing: " + json_file_in);
54 System.out.println(" pageCount = " + ef_page_count);
55 }
56
57 JSONArray ef_pages = ef_features.getJSONArray("pages");
58 int ef_num_pages = ef_pages.length();
59 if (ef_num_pages != ef_page_count) {
60 System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")"
61 +" does not match 'pageCount' metadata (" + ef_page_count + ")");
62 }
63
64 if (_verbosity >= 3) {
65 System.out.print(" Pages: ");
66 }
67
68 for (int i = 0; i < ef_page_count; i++) {
69 String formatted_i = String.format("page-%06d", i);
70 String page_id = volume_id + "." + formatted_i;
71
72 if (_verbosity >= 3) {
73 if (i>0) {
74 System.out.print(", ");
75 }
76 System.out.print(page_id);
77 }
78
79 if (i==(ef_page_count-1)) {
80 if (_verbosity >= 3) {
81 System.out.println();
82 }
83 }
84
85 JSONObject ef_page = ef_pages.getJSONObject(i);
86
87 if (ef_page != null) {
88
89 ArrayList<String> page_pos_list = SolrDocJSON.generateTokenPosCountPOSLabels(volume_id, page_id, ef_page);
90 all_pos_list.addAll(page_pos_list);
91 }
92 else {
93 System.err.println("Skipping: " + page_id);
94 }
95 }
96 }
97 else {
98 // File did not exist, or could not be parsed
99 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
100 if (_strict_file_io) {
101 throw new IOException(mess);
102 }
103 else {
104 System.err.println("Warning: " + mess);
105 System.out.println("Warning: " + mess);
106 }
107 }
108
109 _progress_accum.add(_progress_step);
110
111 return all_pos_list.iterator();
112 }
113
114
115}
116
Note: See TracBrowser for help on using the repository browser.