source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeCatalogLangSequenceFileMap.java@ 32101

Last change on this file since 32101 was 31360, checked in by davidb, 7 years ago

Seems to be a Text class, not a String class, coming out of the sequenceFiles

  • Property svn:executable set to *
File size: 1.6 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4
5import org.apache.hadoop.io.Text;
6import org.apache.spark.api.java.function.Function;
7import org.json.JSONObject;
8
9class PerVolumeCatalogLangSequenceFileMap implements Function<Text, String>
10{
11 private static final long serialVersionUID = 1L;
12
13 protected String _input_dir;
14 protected int _verbosity;
15
16 boolean _strict_file_io;
17
18 public PerVolumeCatalogLangSequenceFileMap(String input_dir, int verbosity, boolean strict_file_io)
19 {
20 _input_dir = input_dir;
21 _verbosity = verbosity;
22
23 _strict_file_io = strict_file_io;
24 }
25
26 public String call(Text json_text) throws IOException
27 {
28 String catalog_lang = null;
29
30 try {
31 JSONObject extracted_feature_record = new JSONObject(json_text.toString());
32
33 String volume_id = extracted_feature_record.getString("id");
34
35 JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
36
37 if (_verbosity >= 1) {
38 System.out.println("Processing: " + volume_id);
39 }
40
41 if (ef_metadata != null) {
42 String ef_catalog_language = ef_metadata.getString("language");
43 if (!ef_catalog_language.equals("")) {
44
45 catalog_lang = ef_catalog_language;
46 }
47 else {
48 System.err.println("No catalog 'language' metadata => Skipping id: " + volume_id);
49 }
50 }
51 else {
52 System.err.println("No 'metadata' section in JSON file => Skipping id: " + volume_id);
53 }
54
55 }
56 catch (Exception e) {
57 if (_strict_file_io) {
58 throw e;
59 }
60 else {
61 e.printStackTrace();
62 }
63 }
64
65 return catalog_lang;
66 }
67}
68
Note: See TracBrowser for help on using the repository browser.