source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/UniversalPOSLangMap.java@ 31503

Last change on this file since 31503 was 31503, checked in by davidb, 7 years ago

Monitor for missing POS keys, and print out details first time each missing type is encountered

File size: 3.0 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.File;
4import java.io.IOException;
5import java.net.URI;
6import java.nio.file.Files;
7import java.nio.file.Path;
8import java.nio.file.Paths;
9import java.util.HashMap;
10import java.util.List;
11import java.util.stream.Collectors;
12import java.util.stream.Stream;
13
/**
 * Lookup table that maps language-specific part-of-speech (POS) tags onto
 * universal POS tags.
 *
 * One lookup table is loaded per file found under the language-map directory.
 * The first two characters of each file's name are used as its language key,
 * and every line consisting of exactly two tab-separated fields contributes
 * one (POS tag, universal POS tag) entry. Lines of any other shape are ignored.
 *
 * The class also counts how often each (language, POS tag) pair could not be
 * resolved, printing a warning only the first time a given pair is missed.
 */
public class UniversalPOSLangMap
{

    /** Language key -> (language-specific POS tag -> universal POS tag). */
    protected HashMap<String,HashMap<String,String>> _all_langmaps;

    /** "lang:pos" -> number of times that POS tag was not found for that language. */
    protected HashMap<String,Integer> _missing_pos;

    /**
     * Loads every language map found under the given directory.
     *
     * @param langmap_directory the directory holding the language-map files,
     *        expressed as a URI string (it is parsed with {@link URI} and
     *        resolved with {@link Paths#get(URI)}). If the string is not a
     *        valid URI, or the directory cannot be walked, the stack trace is
     *        printed and the map is left empty.
     */
    public UniversalPOSLangMap(String langmap_directory) {
        System.out.println("Constructing: UniversalPOS Language Map");

        _all_langmaps = new HashMap<String,HashMap<String,String>>();
        // Must exist before the first lookup miss is recorded
        _missing_pos = new HashMap<String,Integer>();

        // For-each language file
        listLangmapFiles(langmap_directory).forEach(langmap_path -> {
            String langmap_filename = langmap_path.getFileName().toString();
            if (langmap_filename.length() < 2) {
                // Too short to carry a two-letter language key
                System.err.println("Warning: skipping language map file with too short a name: " + langmap_path);
                return;
            }
            String lang_key = langmap_filename.substring(0,2);

            _all_langmaps.put(lang_key, loadLangmapFile(langmap_path));
        });

        System.out.println("Done Constructing UniversalPOS Language Map");
    }

    /** Returns every regular file under the directory, or an empty list if it cannot be read. */
    private List<Path> listLangmapFiles(String langmap_directory)
    {
        URI langmap_directory_uri;
        try {
            langmap_directory_uri = new URI(langmap_directory);
        }
        catch (java.net.URISyntaxException e) {
            e.printStackTrace();
            return java.util.Collections.<Path>emptyList();
        }

        try (Stream<Path> stream_paths = Files.walk(Paths.get(langmap_directory_uri))) {
            return stream_paths
                .filter(Files::isRegularFile)
                .collect(Collectors.toList());
        }
        catch (IOException e) {
            e.printStackTrace();
            return java.util.Collections.<Path>emptyList();
        }
    }

    /** Parses one language-map file into a (POS tag -> universal POS tag) table. */
    private HashMap<String,String> loadLangmapFile(Path langmap_path)
    {
        HashMap<String,String> pos_lookup = new HashMap<String,String>();

        // For-each line within that language file
        try (Stream<String> lang_lines = Files.lines(langmap_path)) {
            lang_lines.forEach(line -> {
                String[] line_parts = line.split("\\t");
                if (line_parts.length == 2) {
                    pos_lookup.put(line_parts[0], line_parts[1]);
                }
            });
        }
        catch (IOException e) {
            // Keep whatever was read before the failure
            e.printStackTrace();
        }

        return pos_lookup;
    }

    /**
     * @return the number of languages for which a map was loaded
     */
    public int size()
    {
        return _all_langmaps.size();
    }

    /**
     * @param lang_key the language key to test
     * @return true if a map was loaded for the given language key
     */
    public boolean containsLanguage(String lang_key)
    {
        return _all_langmaps.containsKey(lang_key);
    }

    /**
     * Translates a language-specific POS tag into its universal POS tag.
     *
     * @param lang_key the language key to look the tag up under
     * @param opennlp_pos_key the language-specific POS tag
     * @return the mapped universal POS tag; "X" if the language is known but
     *         the tag is not (the miss is counted, and a warning is printed
     *         the first time it happens for that language/tag pair); or
     *         {@code null} if no map was loaded for the language
     */
    public String getUniversalLanguagePOS(String lang_key,String opennlp_pos_key)
    {
        HashMap<String,String> langmap = _all_langmaps.get(lang_key);
        if (langmap == null) {
            return null;
        }

        String universal_pos = langmap.get(opennlp_pos_key);
        if (universal_pos == null) {
            String missing_lang_pos = lang_key + ":" + opennlp_pos_key;

            if (!_missing_pos.containsKey(missing_lang_pos)) {
                System.err.println("Warning: for language key '"+lang_key
                        +"' failed to find POS '" + opennlp_pos_key + "'");
                System.err.println("Defaulting to POS 'X' (i.e., 'other')");
            }
            // Count under the same composite key that is tested above, so the
            // warning really is printed only once per language/tag pair
            _missing_pos.merge(missing_lang_pos, 1, Integer::sum);

            universal_pos = "X";
        }

        return universal_pos;
    }

}
Note: See TracBrowser for help on using the repository browser.