1 | package org.hathitrust.extractedfeatures;
|
---|
2 |
|
---|
3 | import java.io.File;
|
---|
4 | import java.io.IOException;
|
---|
5 | import java.net.URI;
|
---|
6 | import java.nio.file.Files;
|
---|
7 | import java.nio.file.Path;
|
---|
8 | import java.nio.file.Paths;
|
---|
9 | import java.util.HashMap;
|
---|
10 | import java.util.List;
|
---|
11 | import java.util.stream.Collectors;
|
---|
12 | import java.util.stream.Stream;
|
---|
13 |
|
---|
14 | import scala.Tuple2;
|
---|
15 |
|
---|
16 | public class UniversalPOSLangMap
|
---|
17 | {
|
---|
18 |
|
---|
19 | protected HashMap<String,HashMap<String,String>> _all_langmaps;
|
---|
20 |
|
---|
21 | protected HashMap<String,Integer> _missing_pos;
|
---|
22 |
|
---|
23 | public UniversalPOSLangMap(String langmap_directory) {
|
---|
24 | System.out.println("Constructing: UniversalPOS Language Map");
|
---|
25 |
|
---|
26 | _missing_pos = new HashMap<String,Integer>();
|
---|
27 |
|
---|
28 | _all_langmaps = new HashMap<String,HashMap<String,String>>();
|
---|
29 |
|
---|
30 | List<Path> langmap_paths = null;
|
---|
31 |
|
---|
32 | URI langmap_directory_uri = null;
|
---|
33 |
|
---|
34 | try {
|
---|
35 | langmap_directory_uri = new URI(langmap_directory);
|
---|
36 | }
|
---|
37 | catch (Exception e) {
|
---|
38 | e.printStackTrace();
|
---|
39 | }
|
---|
40 |
|
---|
41 | Path langmap_directory_path = null;
|
---|
42 | try {
|
---|
43 | // Spark/Hadoop friendly
|
---|
44 | langmap_directory_path = Paths.get(langmap_directory_uri);
|
---|
45 | }
|
---|
46 | catch (Exception e) {
|
---|
47 | // Relative local file-system friendly
|
---|
48 | langmap_directory_path = Paths.get(langmap_directory_uri.getRawPath());
|
---|
49 | }
|
---|
50 |
|
---|
51 |
|
---|
52 | try (Stream<Path> stream_paths = Files.walk(langmap_directory_path)) {
|
---|
53 | langmap_paths = stream_paths
|
---|
54 | .filter(Files::isRegularFile)
|
---|
55 | .collect(Collectors.toList());
|
---|
56 |
|
---|
57 | } catch (IOException e) {
|
---|
58 | e.printStackTrace();
|
---|
59 | }
|
---|
60 |
|
---|
61 | // For-each language file
|
---|
62 | langmap_paths.forEach(langmap_path -> {
|
---|
63 | File langmap_file = langmap_path.toFile();
|
---|
64 | String lang_key = langmap_file.getName().substring(0,2);
|
---|
65 |
|
---|
66 | HashMap<String,String> pos_lookup = new HashMap<String,String>();
|
---|
67 |
|
---|
68 | // For-each line within that language file
|
---|
69 | try (Stream<String> lang_lines = Files.lines(langmap_path)) {
|
---|
70 | lang_lines.forEach(line -> {
|
---|
71 | String[] line_parts = line.split("\\t");
|
---|
72 | if (line_parts.length == 2) {
|
---|
73 | String pos_key = line_parts[0];
|
---|
74 | String pos_val = line_parts[1];
|
---|
75 | pos_lookup.put(pos_key, pos_val);
|
---|
76 | }
|
---|
77 | });
|
---|
78 | } catch (IOException e) {
|
---|
79 | e.printStackTrace();
|
---|
80 | }
|
---|
81 |
|
---|
82 | _all_langmaps.put(lang_key, pos_lookup);
|
---|
83 | });
|
---|
84 |
|
---|
85 | System.out.println("Done Constructing UniversalPOS Language Map");
|
---|
86 |
|
---|
87 | }
|
---|
88 |
|
---|
89 | public int size()
|
---|
90 | {
|
---|
91 | return _all_langmaps.size();
|
---|
92 | }
|
---|
93 |
|
---|
94 | public boolean containsLanguage(String lang_key)
|
---|
95 | {
|
---|
96 | return _all_langmaps.containsKey(lang_key);
|
---|
97 | }
|
---|
98 |
|
---|
99 | public String getUniversalLanguagePOSUnchecked(String lang_key,String opennlp_pos_key)
|
---|
100 | {
|
---|
101 | String universal_pos = null;
|
---|
102 |
|
---|
103 | HashMap<String,String> langmap = _all_langmaps.get(lang_key);
|
---|
104 | if (langmap != null) {
|
---|
105 | universal_pos = langmap.get(opennlp_pos_key);
|
---|
106 | }
|
---|
107 |
|
---|
108 | return universal_pos;
|
---|
109 | }
|
---|
110 |
|
---|
111 | public String getUniversalLanguagePOSChecked(String lang_key,String opennlp_pos_key)
|
---|
112 | {
|
---|
113 | if (!_all_langmaps.containsKey(lang_key)) {
|
---|
114 | // Not a language with a POS map
|
---|
115 | return "";
|
---|
116 | }
|
---|
117 |
|
---|
118 | String universal_pos = null;
|
---|
119 |
|
---|
120 | HashMap<String,String> langmap = _all_langmaps.get(lang_key);
|
---|
121 | universal_pos = langmap.get(opennlp_pos_key);
|
---|
122 |
|
---|
123 | if (universal_pos == null) {
|
---|
124 | String missing_lang_pos = lang_key + ":" + opennlp_pos_key;
|
---|
125 |
|
---|
126 | Integer mpos_freq = 0;
|
---|
127 | if (_missing_pos.containsKey(missing_lang_pos)) {
|
---|
128 | mpos_freq = _missing_pos.get(missing_lang_pos);
|
---|
129 | }
|
---|
130 | else {
|
---|
131 | System.err.println("Warning: for language key '"+lang_key
|
---|
132 | +"' failed to find POS '" + opennlp_pos_key + "'");
|
---|
133 | System.err.println("Defaulting to POS 'X' (i.e., 'other')");
|
---|
134 | }
|
---|
135 | mpos_freq++;
|
---|
136 | _missing_pos.put(missing_lang_pos,mpos_freq);
|
---|
137 |
|
---|
138 | universal_pos = "X";
|
---|
139 | }
|
---|
140 |
|
---|
141 | return universal_pos;
|
---|
142 | }
|
---|
143 |
|
---|
144 | public Tuple2<String,String> getUniversalLanguagePOSPair(String[] lang_keys,String opennlp_pos_key)
|
---|
145 | {
|
---|
146 | String universal_pos = null;
|
---|
147 | String selected_lang = null;
|
---|
148 |
|
---|
149 | for (int li=0; li<lang_keys.length; li++) {
|
---|
150 | String lang_key = lang_keys[li];
|
---|
151 |
|
---|
152 | universal_pos = getUniversalLanguagePOSUnchecked(lang_key,opennlp_pos_key);
|
---|
153 | if (universal_pos != null) {
|
---|
154 | selected_lang = lang_key;
|
---|
155 | break;
|
---|
156 | }
|
---|
157 | }
|
---|
158 |
|
---|
159 | if (universal_pos == null) {
|
---|
160 | // Failed to any match in any of the given languages
|
---|
161 | // => Lock onto the first language (highest probability when modeled)
|
---|
162 | selected_lang = lang_keys[0];
|
---|
163 |
|
---|
164 | if (!_all_langmaps.containsKey(selected_lang)) {
|
---|
165 | // Not a language with a POS map
|
---|
166 | return new Tuple2<String,String>(selected_lang,null);
|
---|
167 | }
|
---|
168 |
|
---|
169 | // If here, then is a POS language => default to "X"
|
---|
170 |
|
---|
171 | String missing_lang_pos = selected_lang + ":" + opennlp_pos_key;
|
---|
172 |
|
---|
173 | Integer mpos_freq = 0;
|
---|
174 | if (_missing_pos.containsKey(missing_lang_pos)) {
|
---|
175 | mpos_freq = _missing_pos.get(missing_lang_pos);
|
---|
176 | }
|
---|
177 | else {
|
---|
178 | System.err.println("Warning: for language key '"+selected_lang
|
---|
179 | +"' failed to find POS '" + opennlp_pos_key + "'");
|
---|
180 | System.err.println("Defaulting to POS 'X' (i.e., 'other')");
|
---|
181 | }
|
---|
182 | mpos_freq++;
|
---|
183 | _missing_pos.put(missing_lang_pos,mpos_freq);
|
---|
184 |
|
---|
185 | universal_pos = "X";
|
---|
186 | }
|
---|
187 |
|
---|
188 | return new Tuple2<String,String>(selected_lang,universal_pos);
|
---|
189 | }
|
---|
190 | }
|
---|