source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/UniversalPOSLangMap.java@ 31509

Last change on this file since 31509 was 31509, checked in by davidb, 7 years ago

LangPos determination changed to lock into first match, rather than trying to populate all model-predicted languages with the determined POS (which gets mixed up when l=en,de and POS=NE for example -- de goes wrong, has no match)

File size: 4.8 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.File;
4import java.io.IOException;
5import java.net.URI;
6import java.nio.file.Files;
7import java.nio.file.Path;
8import java.nio.file.Paths;
9import java.util.HashMap;
10import java.util.List;
11import java.util.stream.Collectors;
12import java.util.stream.Stream;
13
14import scala.Tuple2;
15
16public class UniversalPOSLangMap
17{
18
19 protected HashMap<String,HashMap<String,String>> _all_langmaps;
20
21 protected HashMap<String,Integer> _missing_pos;
22
23 public UniversalPOSLangMap(String langmap_directory) {
24 System.out.println("Constructing: UniversalPOS Language Map");
25
26 _missing_pos = new HashMap<String,Integer>();
27
28 _all_langmaps = new HashMap<String,HashMap<String,String>>();
29
30 List<Path> langmap_paths = null;
31
32 URI langmap_directory_uri = null;
33
34 try {
35 langmap_directory_uri = new URI(langmap_directory);
36 }
37 catch (Exception e) {
38 e.printStackTrace();
39 }
40 try (Stream<Path> stream_paths = Files.walk(Paths.get(langmap_directory_uri))) {
41 langmap_paths = stream_paths
42 .filter(Files::isRegularFile)
43 .collect(Collectors.toList());
44
45 } catch (IOException e) {
46 e.printStackTrace();
47 }
48
49 // For-each language file
50 langmap_paths.forEach(langmap_path -> {
51 File langmap_file = langmap_path.toFile();
52 String lang_key = langmap_file.getName().substring(0,2);
53
54 HashMap<String,String> pos_lookup = new HashMap<String,String>();
55
56 // For-each line within that language file
57 try (Stream<String> lang_lines = Files.lines(langmap_path)) {
58 lang_lines.forEach(line -> {
59 String[] line_parts = line.split("\\t");
60 if (line_parts.length == 2) {
61 String pos_key = line_parts[0];
62 String pos_val = line_parts[1];
63 pos_lookup.put(pos_key, pos_val);
64 }
65 });
66 } catch (IOException e) {
67 e.printStackTrace();
68 }
69
70 _all_langmaps.put(lang_key, pos_lookup);
71 });
72
73 System.out.println("Done Constructing UniversalPOS Language Map");
74
75 }
76
77 public int size()
78 {
79 return _all_langmaps.size();
80 }
81
82 public boolean containsLanguage(String lang_key)
83 {
84 return _all_langmaps.containsKey(lang_key);
85 }
86
87 public String getUniversalLanguagePOSUnchecked(String lang_key,String opennlp_pos_key)
88 {
89 String universal_pos = null;
90
91 HashMap<String,String> langmap = _all_langmaps.get(lang_key);
92 if (langmap != null) {
93 universal_pos = langmap.get(opennlp_pos_key);
94 }
95
96 return universal_pos;
97 }
98
99 public String getUniversalLanguagePOSChecked(String lang_key,String opennlp_pos_key)
100 {
101 if (!_all_langmaps.containsKey(lang_key)) {
102 // Not a language with a POS map
103 return "";
104 }
105
106 String universal_pos = null;
107
108 HashMap<String,String> langmap = _all_langmaps.get(lang_key);
109 universal_pos = langmap.get(opennlp_pos_key);
110
111 if (universal_pos == null) {
112 String missing_lang_pos = lang_key + ":" + opennlp_pos_key;
113
114 Integer mpos_freq = 0;
115 if (_missing_pos.containsKey(missing_lang_pos)) {
116 mpos_freq = _missing_pos.get(missing_lang_pos);
117 }
118 else {
119 System.err.println("Warning: for language key '"+lang_key
120 +"' failed to find POS '" + opennlp_pos_key + "'");
121 System.err.println("Defaulting to POS 'X' (i.e., 'other')");
122 }
123 mpos_freq++;
124 _missing_pos.put(missing_lang_pos,mpos_freq);
125
126 universal_pos = "X";
127 }
128
129 return universal_pos;
130 }
131
132 public Tuple2<String,String> getUniversalLanguagePOSPair(String[] lang_keys,String opennlp_pos_key)
133 {
134 String universal_pos = null;
135 String selected_lang = null;
136
137 for (int li=0; li<lang_keys.length; li++) {
138 String lang_key = lang_keys[li];
139
140 universal_pos = getUniversalLanguagePOSUnchecked(lang_key,opennlp_pos_key);
141 if (universal_pos != null) {
142 selected_lang = lang_key;
143 break;
144 }
145 }
146
147 if (universal_pos == null) {
148 // Failed to any match in any of the given languages
149 // => Lock onto the first language (highest probability when modeled)
150 selected_lang = lang_keys[0];
151
152 if (!_all_langmaps.containsKey(selected_lang)) {
153 // Not a language with a POS map
154 return new Tuple2<String,String>(selected_lang,null);
155 }
156
157 // If here, then is a POS language => default to "X"
158
159 String missing_lang_pos = selected_lang + ":" + opennlp_pos_key;
160
161 Integer mpos_freq = 0;
162 if (_missing_pos.containsKey(missing_lang_pos)) {
163 mpos_freq = _missing_pos.get(missing_lang_pos);
164 }
165 else {
166 System.err.println("Warning: for language key '"+selected_lang
167 +"' failed to find POS '" + opennlp_pos_key + "'");
168 System.err.println("Defaulting to POS 'X' (i.e., 'other')");
169 }
170 mpos_freq++;
171 _missing_pos.put(missing_lang_pos,mpos_freq);
172
173 universal_pos = "X";
174 }
175
176 return new Tuple2<String,String>(selected_lang,universal_pos);
177 }
178}
Note: See TracBrowser for help on using the repository browser.