source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeCatalogLangStreamFlatmap.java@ 32101

Last change on this file since 32101 was 31294, checked in by davidb, 7 years ago

Version for language counting based on the catalog-assigned language metadata. Only one entry is emitted per volume.

  • Property svn:executable set to *
File size: 2.4 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import org.apache.spark.api.java.function.FlatMapFunction;
8import org.apache.spark.util.DoubleAccumulator;
9import org.json.JSONArray;
10import org.json.JSONObject;
11
12class PerVolumeCatalogLangStreamFlatmap implements FlatMapFunction<String, String>
13{
14 private static final long serialVersionUID = 1L;
15
16 protected String _input_dir;
17 protected int _verbosity;
18
19 protected DoubleAccumulator _progress_accum;
20 protected double _progress_step;
21
22 boolean _strict_file_io;
23
24 public PerVolumeCatalogLangStreamFlatmap(String input_dir, int verbosity,
25 DoubleAccumulator progress_accum, double progress_step,
26 boolean strict_file_io)
27 {
28 _input_dir = input_dir;
29 _verbosity = verbosity;
30
31 _progress_accum = progress_accum;
32 _progress_step = progress_step;
33
34 _strict_file_io = strict_file_io;
35 }
36
37 public Iterator<String> call(String json_file_in) throws IOException
38 {
39
40 String full_json_file_in = _input_dir + "/" + json_file_in;
41 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
42
43 ArrayList<String> catalog_lang_list = new ArrayList<String>();
44
45 if (extracted_feature_record != null) {
46 String volume_id = extracted_feature_record.getString("id");
47
48 JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
49
50 if (_verbosity >= 1) {
51 System.out.println("Processing: " + json_file_in);
52 }
53
54 if (ef_metadata != null) {
55 String ef_catalog_language = ef_metadata.getString("language");
56 if (!ef_catalog_language.equals("")) {
57
58 catalog_lang_list.add(ef_catalog_language);
59 }
60 else {
61 System.err.println("No catalog 'language' metadata => Skipping id: " + volume_id);
62 }
63 }
64 else {
65 System.err.println("No 'metadata' section in JSON file => Skipping id: " + volume_id);
66 }
67
68 }
69 else {
70 // File did not exist, or could not be parsed
71 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
72 if (_strict_file_io) {
73 throw new IOException(mess);
74 }
75 else {
76 System.err.println("Warning: " + mess);
77 System.out.println("Warning: " + mess);
78 }
79 }
80
81 _progress_accum.add(_progress_step);
82
83 return catalog_lang_list.iterator();
84 }
85
86
87}
88
Note: See TracBrowser for help on using the repository browser.