source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeCatalogLangSequenceFileMap.java@ 31359

Last change on this file since 31359 was 31359, checked in by davidb, 7 years ago

Changed over to use sequenceFiles as input

  • Property svn:executable set to *
File size: 1.6 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import org.apache.spark.api.java.function.Function;
5import org.json.JSONObject;
6
7class PerVolumeCatalogLangSequenceFileMap implements Function<String, String>
8{
9 private static final long serialVersionUID = 1L;
10
11 protected String _input_dir;
12 protected int _verbosity;
13
14 boolean _strict_file_io;
15
16 public PerVolumeCatalogLangSequenceFileMap(String input_dir, int verbosity, boolean strict_file_io)
17 {
18 _input_dir = input_dir;
19 _verbosity = verbosity;
20
21 _strict_file_io = strict_file_io;
22 }
23
24 public String call(String json_text) throws IOException
25 {
26 String catalog_lang = null;
27
28 try {
29 JSONObject extracted_feature_record = new JSONObject(json_text);
30
31 String volume_id = extracted_feature_record.getString("id");
32
33 JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
34
35 if (_verbosity >= 1) {
36 System.out.println("Processing: " + volume_id);
37 }
38
39 if (ef_metadata != null) {
40 String ef_catalog_language = ef_metadata.getString("language");
41 if (!ef_catalog_language.equals("")) {
42
43 catalog_lang = ef_catalog_language;
44 }
45 else {
46 System.err.println("No catalog 'language' metadata => Skipping id: " + volume_id);
47 }
48 }
49 else {
50 System.err.println("No 'metadata' section in JSON file => Skipping id: " + volume_id);
51 }
52
53 }
54 catch (Exception e) {
55 if (_strict_file_io) {
56 throw e;
57 }
58 else {
59 e.printStackTrace();
60 }
61 }
62
63 return catalog_lang;
64 }
65}
66
Note: See TracBrowser for help on using the repository browser.