1 | package org.hathitrust.extractedfeatures;
|
---|
2 |
|
---|
3 | import java.io.File;
|
---|
4 | import java.io.IOException;
|
---|
5 | import java.net.URI;
|
---|
6 | import java.nio.file.Files;
|
---|
7 | import java.nio.file.Path;
|
---|
8 | import java.nio.file.Paths;
|
---|
9 | import java.util.HashMap;
|
---|
10 | import java.util.List;
|
---|
11 | import java.util.stream.Collectors;
|
---|
12 | import java.util.stream.Stream;
|
---|
13 |
|
---|
14 | import scala.Tuple2;
|
---|
15 |
|
---|
16 | public class UniversalPOSLangMap
|
---|
17 | {
|
---|
18 |
|
---|
19 | protected HashMap<String,HashMap<String,String>> _all_langmaps;
|
---|
20 |
|
---|
21 | protected HashMap<String,Integer> _missing_pos;
|
---|
22 |
|
---|
23 | public UniversalPOSLangMap(String langmap_directory) {
|
---|
24 | System.out.println("Constructing: UniversalPOS Language Map");
|
---|
25 |
|
---|
26 | _missing_pos = new HashMap<String,Integer>();
|
---|
27 |
|
---|
28 | _all_langmaps = new HashMap<String,HashMap<String,String>>();
|
---|
29 |
|
---|
30 | List<Path> langmap_paths = null;
|
---|
31 |
|
---|
32 | URI langmap_directory_uri = null;
|
---|
33 |
|
---|
34 | try {
|
---|
35 | langmap_directory_uri = new URI(langmap_directory);
|
---|
36 | }
|
---|
37 | catch (Exception e) {
|
---|
38 | e.printStackTrace();
|
---|
39 | }
|
---|
40 | try (Stream<Path> stream_paths = Files.walk(Paths.get(langmap_directory_uri))) {
|
---|
41 | langmap_paths = stream_paths
|
---|
42 | .filter(Files::isRegularFile)
|
---|
43 | .collect(Collectors.toList());
|
---|
44 |
|
---|
45 | } catch (IOException e) {
|
---|
46 | e.printStackTrace();
|
---|
47 | }
|
---|
48 |
|
---|
49 | // For-each language file
|
---|
50 | langmap_paths.forEach(langmap_path -> {
|
---|
51 | File langmap_file = langmap_path.toFile();
|
---|
52 | String lang_key = langmap_file.getName().substring(0,2);
|
---|
53 |
|
---|
54 | HashMap<String,String> pos_lookup = new HashMap<String,String>();
|
---|
55 |
|
---|
56 | // For-each line within that language file
|
---|
57 | try (Stream<String> lang_lines = Files.lines(langmap_path)) {
|
---|
58 | lang_lines.forEach(line -> {
|
---|
59 | String[] line_parts = line.split("\\t");
|
---|
60 | if (line_parts.length == 2) {
|
---|
61 | String pos_key = line_parts[0];
|
---|
62 | String pos_val = line_parts[1];
|
---|
63 | pos_lookup.put(pos_key, pos_val);
|
---|
64 | }
|
---|
65 | });
|
---|
66 | } catch (IOException e) {
|
---|
67 | e.printStackTrace();
|
---|
68 | }
|
---|
69 |
|
---|
70 | _all_langmaps.put(lang_key, pos_lookup);
|
---|
71 | });
|
---|
72 |
|
---|
73 | System.out.println("Done Constructing UniversalPOS Language Map");
|
---|
74 |
|
---|
75 | }
|
---|
76 |
|
---|
77 | public int size()
|
---|
78 | {
|
---|
79 | return _all_langmaps.size();
|
---|
80 | }
|
---|
81 |
|
---|
82 | public boolean containsLanguage(String lang_key)
|
---|
83 | {
|
---|
84 | return _all_langmaps.containsKey(lang_key);
|
---|
85 | }
|
---|
86 |
|
---|
87 | public String getUniversalLanguagePOSUnchecked(String lang_key,String opennlp_pos_key)
|
---|
88 | {
|
---|
89 | String universal_pos = null;
|
---|
90 |
|
---|
91 | HashMap<String,String> langmap = _all_langmaps.get(lang_key);
|
---|
92 | if (langmap != null) {
|
---|
93 | universal_pos = langmap.get(opennlp_pos_key);
|
---|
94 | }
|
---|
95 |
|
---|
96 | return universal_pos;
|
---|
97 | }
|
---|
98 |
|
---|
99 | public String getUniversalLanguagePOSChecked(String lang_key,String opennlp_pos_key)
|
---|
100 | {
|
---|
101 | if (!_all_langmaps.containsKey(lang_key)) {
|
---|
102 | // Not a language with a POS map
|
---|
103 | return "";
|
---|
104 | }
|
---|
105 |
|
---|
106 | String universal_pos = null;
|
---|
107 |
|
---|
108 | HashMap<String,String> langmap = _all_langmaps.get(lang_key);
|
---|
109 | universal_pos = langmap.get(opennlp_pos_key);
|
---|
110 |
|
---|
111 | if (universal_pos == null) {
|
---|
112 | String missing_lang_pos = lang_key + ":" + opennlp_pos_key;
|
---|
113 |
|
---|
114 | Integer mpos_freq = 0;
|
---|
115 | if (_missing_pos.containsKey(missing_lang_pos)) {
|
---|
116 | mpos_freq = _missing_pos.get(missing_lang_pos);
|
---|
117 | }
|
---|
118 | else {
|
---|
119 | System.err.println("Warning: for language key '"+lang_key
|
---|
120 | +"' failed to find POS '" + opennlp_pos_key + "'");
|
---|
121 | System.err.println("Defaulting to POS 'X' (i.e., 'other')");
|
---|
122 | }
|
---|
123 | mpos_freq++;
|
---|
124 | _missing_pos.put(missing_lang_pos,mpos_freq);
|
---|
125 |
|
---|
126 | universal_pos = "X";
|
---|
127 | }
|
---|
128 |
|
---|
129 | return universal_pos;
|
---|
130 | }
|
---|
131 |
|
---|
132 | public Tuple2<String,String> getUniversalLanguagePOSPair(String[] lang_keys,String opennlp_pos_key)
|
---|
133 | {
|
---|
134 | String universal_pos = null;
|
---|
135 | String selected_lang = null;
|
---|
136 |
|
---|
137 | for (int li=0; li<lang_keys.length; li++) {
|
---|
138 | String lang_key = lang_keys[li];
|
---|
139 |
|
---|
140 | universal_pos = getUniversalLanguagePOSUnchecked(lang_key,opennlp_pos_key);
|
---|
141 | if (universal_pos != null) {
|
---|
142 | selected_lang = lang_key;
|
---|
143 | break;
|
---|
144 | }
|
---|
145 | }
|
---|
146 |
|
---|
147 | if (universal_pos == null) {
|
---|
148 | // Failed to any match in any of the given languages
|
---|
149 | // => Lock onto the first language (highest probability when modeled)
|
---|
150 | selected_lang = lang_keys[0];
|
---|
151 |
|
---|
152 | if (!_all_langmaps.containsKey(selected_lang)) {
|
---|
153 | // Not a language with a POS map
|
---|
154 | return new Tuple2<String,String>(selected_lang,null);
|
---|
155 | }
|
---|
156 |
|
---|
157 | // If here, then is a POS language => default to "X"
|
---|
158 |
|
---|
159 | String missing_lang_pos = selected_lang + ":" + opennlp_pos_key;
|
---|
160 |
|
---|
161 | Integer mpos_freq = 0;
|
---|
162 | if (_missing_pos.containsKey(missing_lang_pos)) {
|
---|
163 | mpos_freq = _missing_pos.get(missing_lang_pos);
|
---|
164 | }
|
---|
165 | else {
|
---|
166 | System.err.println("Warning: for language key '"+selected_lang
|
---|
167 | +"' failed to find POS '" + opennlp_pos_key + "'");
|
---|
168 | System.err.println("Defaulting to POS 'X' (i.e., 'other')");
|
---|
169 | }
|
---|
170 | mpos_freq++;
|
---|
171 | _missing_pos.put(missing_lang_pos,mpos_freq);
|
---|
172 |
|
---|
173 | universal_pos = "X";
|
---|
174 | }
|
---|
175 |
|
---|
176 | return new Tuple2<String,String>(selected_lang,universal_pos);
|
---|
177 | }
|
---|
178 | }
|
---|