source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/UniversalPOSLangMap.java@ 32101

Last change on this file since 32101 was 32101, checked in by davidb, 6 years ago

Tweaks to allow serial ingest to run

File size: 5.1 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.File;
4import java.io.IOException;
5import java.net.URI;
6import java.nio.file.Files;
7import java.nio.file.Path;
8import java.nio.file.Paths;
9import java.util.HashMap;
10import java.util.List;
11import java.util.stream.Collectors;
12import java.util.stream.Stream;
13
14import scala.Tuple2;
15
16public class UniversalPOSLangMap
17{
18
19 protected HashMap<String,HashMap<String,String>> _all_langmaps;
20
21 protected HashMap<String,Integer> _missing_pos;
22
23 public UniversalPOSLangMap(String langmap_directory) {
24 System.out.println("Constructing: UniversalPOS Language Map");
25
26 _missing_pos = new HashMap<String,Integer>();
27
28 _all_langmaps = new HashMap<String,HashMap<String,String>>();
29
30 List<Path> langmap_paths = null;
31
32 URI langmap_directory_uri = null;
33
34 try {
35 langmap_directory_uri = new URI(langmap_directory);
36 }
37 catch (Exception e) {
38 e.printStackTrace();
39 }
40
41 Path langmap_directory_path = null;
42 try {
43 // Spark/Hadoop friendly
44 langmap_directory_path = Paths.get(langmap_directory_uri);
45 }
46 catch (Exception e) {
47 // Relative local file-system friendly
48 langmap_directory_path = Paths.get(langmap_directory_uri.getRawPath());
49 }
50
51
52 try (Stream<Path> stream_paths = Files.walk(langmap_directory_path)) {
53 langmap_paths = stream_paths
54 .filter(Files::isRegularFile)
55 .collect(Collectors.toList());
56
57 } catch (IOException e) {
58 e.printStackTrace();
59 }
60
61 // For-each language file
62 langmap_paths.forEach(langmap_path -> {
63 File langmap_file = langmap_path.toFile();
64 String lang_key = langmap_file.getName().substring(0,2);
65
66 HashMap<String,String> pos_lookup = new HashMap<String,String>();
67
68 // For-each line within that language file
69 try (Stream<String> lang_lines = Files.lines(langmap_path)) {
70 lang_lines.forEach(line -> {
71 String[] line_parts = line.split("\\t");
72 if (line_parts.length == 2) {
73 String pos_key = line_parts[0];
74 String pos_val = line_parts[1];
75 pos_lookup.put(pos_key, pos_val);
76 }
77 });
78 } catch (IOException e) {
79 e.printStackTrace();
80 }
81
82 _all_langmaps.put(lang_key, pos_lookup);
83 });
84
85 System.out.println("Done Constructing UniversalPOS Language Map");
86
87 }
88
89 public int size()
90 {
91 return _all_langmaps.size();
92 }
93
94 public boolean containsLanguage(String lang_key)
95 {
96 return _all_langmaps.containsKey(lang_key);
97 }
98
99 public String getUniversalLanguagePOSUnchecked(String lang_key,String opennlp_pos_key)
100 {
101 String universal_pos = null;
102
103 HashMap<String,String> langmap = _all_langmaps.get(lang_key);
104 if (langmap != null) {
105 universal_pos = langmap.get(opennlp_pos_key);
106 }
107
108 return universal_pos;
109 }
110
111 public String getUniversalLanguagePOSChecked(String lang_key,String opennlp_pos_key)
112 {
113 if (!_all_langmaps.containsKey(lang_key)) {
114 // Not a language with a POS map
115 return "";
116 }
117
118 String universal_pos = null;
119
120 HashMap<String,String> langmap = _all_langmaps.get(lang_key);
121 universal_pos = langmap.get(opennlp_pos_key);
122
123 if (universal_pos == null) {
124 String missing_lang_pos = lang_key + ":" + opennlp_pos_key;
125
126 Integer mpos_freq = 0;
127 if (_missing_pos.containsKey(missing_lang_pos)) {
128 mpos_freq = _missing_pos.get(missing_lang_pos);
129 }
130 else {
131 System.err.println("Warning: for language key '"+lang_key
132 +"' failed to find POS '" + opennlp_pos_key + "'");
133 System.err.println("Defaulting to POS 'X' (i.e., 'other')");
134 }
135 mpos_freq++;
136 _missing_pos.put(missing_lang_pos,mpos_freq);
137
138 universal_pos = "X";
139 }
140
141 return universal_pos;
142 }
143
144 public Tuple2<String,String> getUniversalLanguagePOSPair(String[] lang_keys,String opennlp_pos_key)
145 {
146 String universal_pos = null;
147 String selected_lang = null;
148
149 for (int li=0; li<lang_keys.length; li++) {
150 String lang_key = lang_keys[li];
151
152 universal_pos = getUniversalLanguagePOSUnchecked(lang_key,opennlp_pos_key);
153 if (universal_pos != null) {
154 selected_lang = lang_key;
155 break;
156 }
157 }
158
159 if (universal_pos == null) {
160 // Failed to any match in any of the given languages
161 // => Lock onto the first language (highest probability when modeled)
162 selected_lang = lang_keys[0];
163
164 if (!_all_langmaps.containsKey(selected_lang)) {
165 // Not a language with a POS map
166 return new Tuple2<String,String>(selected_lang,null);
167 }
168
169 // If here, then is a POS language => default to "X"
170
171 String missing_lang_pos = selected_lang + ":" + opennlp_pos_key;
172
173 Integer mpos_freq = 0;
174 if (_missing_pos.containsKey(missing_lang_pos)) {
175 mpos_freq = _missing_pos.get(missing_lang_pos);
176 }
177 else {
178 System.err.println("Warning: for language key '"+selected_lang
179 +"' failed to find POS '" + opennlp_pos_key + "'");
180 System.err.println("Defaulting to POS 'X' (i.e., 'other')");
181 }
182 mpos_freq++;
183 _missing_pos.put(missing_lang_pos,mpos_freq);
184
185 universal_pos = "X";
186 }
187
188 return new Tuple2<String,String>(selected_lang,universal_pos);
189 }
190}
Note: See TracBrowser for help on using the repository browser.