source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeWordStreamFlatmap.java@31273

Last change on this file since 31273 was 31273, checked in by davidb, 7 years ago

Code moved to storing fields for multilingual use via dynamic Solr fields *_htrctoken. Text is now also put in as separate tokens.

  • Property svn:executable set to *
File size: 3.3 KB
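
The commit message above mentions dynamic Solr fields matching *_htrctoken. As a rough sketch of that naming scheme (the "eng_" language prefix, the document layout, and the field values are illustrative assumptions, not taken from this file), a per-page Solr add document built with the same org.json classes this file imports might look like:

import org.json.JSONArray;
import org.json.JSONObject;

// Hedged sketch only: lays a page's tokens under a field name that a schema's
// dynamic *_htrctoken pattern would catch. The "eng_" prefix and the exact
// field names are assumptions for illustration.
public class HtrcTokenDocSketch {
    public static void main(String[] args) {
        JSONObject solr_doc = new JSONObject();
        // Page IDs in the code below take the form <volume-id>.page-000042
        solr_doc.put("id", "mdp.39015012345678.page-000042");

        JSONArray tokens = new JSONArray();
        tokens.put("whale");
        tokens.put("ship");
        solr_doc.put("eng_htrctoken", tokens); // caught by the *_htrctoken pattern

        System.out.println(solr_doc.toString(2));
    }
}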
package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.util.DoubleAccumulator;
import org.json.JSONArray;
import org.json.JSONObject;

// Spark flatmap that expands one Extracted Features JSON file (one volume)
// into a stream of per-token strings, one entry per whitelisted word.
class PerVolumeWordStreamFlatmap implements FlatMapFunction<String, String>
{
    private static final long serialVersionUID = 1L;

    protected String _input_dir;
    protected int _verbosity;

    protected DoubleAccumulator _progress_accum;
    protected double _progress_step;

    boolean _icu_tokenize;
    boolean _strict_file_io;

    public PerVolumeWordStreamFlatmap(String input_dir, int verbosity,
                                      DoubleAccumulator progress_accum, double progress_step,
                                      boolean icu_tokenize,
                                      boolean strict_file_io)
    {
        _input_dir = input_dir;
        _verbosity = verbosity;

        _progress_accum = progress_accum;
        _progress_step = progress_step;

        _icu_tokenize = icu_tokenize;
        _strict_file_io = strict_file_io;
    }

    @Override
    public Iterator<String> call(String json_file_in) throws IOException
    {
        String full_json_file_in = _input_dir + "/" + json_file_in;
        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);

        ArrayList<String> all_word_list = new ArrayList<String>();

        if (extracted_feature_record != null) {
            String volume_id = extracted_feature_record.getString("id");

            JSONObject ef_features = extracted_feature_record.getJSONObject("features");

            int ef_page_count = ef_features.getInt("pageCount");

            if (_verbosity >= 1) {
                System.out.println("Processing: " + json_file_in);
                System.out.println("  pageCount = " + ef_page_count);
            }

            // Sanity check: the 'pages' array should agree with the declared page count
            JSONArray ef_pages = ef_features.getJSONArray("pages");
            int ef_num_pages = ef_pages.length();
            if (ef_num_pages != ef_page_count) {
                System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")"
                                   + " does not match 'pageCount' metadata (" + ef_page_count + ")");
            }

            if (_verbosity >= 2) {
                System.out.print("  Pages: ");
            }

            for (int i = 0; i < ef_page_count; i++) {
                // Page IDs take the form <volume-id>.page-000123
                String formatted_i = String.format("page-%06d", i);
                String page_id = volume_id + "." + formatted_i;

                if (_verbosity >= 2) {
                    if (i > 0) {
                        System.out.print(", ");
                    }
                    System.out.print(page_id);
                }

                // Terminate the comma-separated page-ID trace after the last page
                if (i == (ef_page_count - 1)) {
                    if (_verbosity >= 2) {
                        System.out.println();
                    }
                }

                JSONObject ef_page = ef_pages.getJSONObject(i);

                if (ef_page != null) {
                    ArrayList<String> page_word_list
                        = SolrDocJSON.generateTokenPosCountWhitelistText(volume_id, page_id, ef_page, _icu_tokenize);
                    all_word_list.addAll(page_word_list);
                }
                else {
                    System.err.println("Skipping: " + page_id);
                }
            }
        }
        else {
            // File did not exist, or could not be parsed
            String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
            if (_strict_file_io) {
                throw new IOException(mess);
            }
            else {
                // Non-strict mode: log the problem on both streams and carry on
                System.err.println("Warning: " + mess);
                System.out.println("Warning: " + mess);
            }
        }

        _progress_accum.add(_progress_step);

        return all_word_list.iterator();
    }
}
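
For context, a hedged driver sketch of how this flatmap could be wired into a Spark job (the file paths, listing-file layout, constructor arguments, and progress-step calculation here are assumptions for illustration, not taken from this repository):

package org.hathitrust.extractedfeatures;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.DoubleAccumulator;

// Hedged sketch: one plausible driver around PerVolumeWordStreamFlatmap.
public class WordStreamDriverSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("HTRC EF word stream");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Assumed layout: one volume's JSON filename per line in a listing file
        JavaRDD<String> json_files = jsc.textFile("hdfs:///user/htrc/json-filelist.txt");

        DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
        double progress_step = 100.0 / json_files.count();

        JavaRDD<String> words = json_files.flatMap(
            new PerVolumeWordStreamFlatmap("hdfs:///user/htrc/json-files", 1,
                                           progress_accum, progress_step,
                                           false /* icu_tokenize */,
                                           false /* strict_file_io */));

        words.saveAsTextFile("hdfs:///user/htrc/word-stream-out");
        jsc.close();
    }
}

The DoubleAccumulator is the piece that makes the _progress_accum.add(_progress_step) call in the flatmap useful: each executor's increments fold back to the driver, giving a rough percent-complete figure across the cluster.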