1 | package org.hathitrust.extractedfeatures;
|
---|
2 |
|
---|
3 | import java.io.File;
|
---|
4 | import java.io.IOException;
|
---|
5 | import java.net.URI;
|
---|
6 | import java.nio.file.Files;
|
---|
7 | import java.nio.file.Path;
|
---|
8 | import java.nio.file.Paths;
|
---|
9 | import java.util.HashMap;
|
---|
10 | import java.util.List;
|
---|
11 | import java.util.stream.Collectors;
|
---|
12 | import java.util.stream.Stream;
|
---|
13 |
|
---|
/**
 * Lookup table that translates language-specific part-of-speech (POS) tags
 * into Universal POS tags.
 *
 * <p>One mapping is loaded per language file found under a directory. The
 * first two characters of a file's name are used as its language key, and
 * each line made of exactly two tab-separated fields contributes one
 * {@code tag -> universal tag} entry.
 *
 * <p>Lookups that fail for a known language are counted in
 * {@link #_missing_pos} under the key {@code "<lang_key>:<pos_key>"}.
 */
public class UniversalPOSLangMap
{
    /** Universal POS tag handed back when a language is known but the tag is not. */
    private static final String DEFAULT_UNIVERSAL_POS = "X";

    /** Language key -> (language-specific POS tag -> Universal POS tag). */
    protected HashMap<String,HashMap<String,String>> _all_langmaps;

    /** {@code "<lang_key>:<pos_key>"} -> number of failed lookups for that pair. */
    protected HashMap<String,Integer> _missing_pos;

    /**
     * Loads every language map file found (recursively) under the given directory.
     *
     * <p>Files whose name is shorter than two characters are skipped with a
     * warning because no language key can be taken from them. If two files
     * share the same two-character prefix, the one visited last replaces the
     * earlier one. A file that cannot be opened is reported on
     * {@code System.err} and registered with an empty mapping.
     *
     * @param langmap_directory the directory holding the language map files,
     *        expressed as a URI string that {@link Paths#get(URI)} accepts
     *        (for example a {@code file:} URI)
     * @throws IllegalArgumentException if {@code langmap_directory} is not a
     *         syntactically valid URI, or is a URI that cannot be converted
     *         to a path
     * @throws java.io.UncheckedIOException if the directory cannot be listed
     */
    public UniversalPOSLangMap(String langmap_directory) {
        System.out.println("Constructing: UniversalPOS Language Map");

        _missing_pos = new HashMap<>();
        _all_langmaps = new HashMap<>();

        // URI.create reports a malformed string as an IllegalArgumentException
        // carrying the URISyntaxException as its cause, so a bad argument fails
        // here with a clear message rather than as a NullPointerException later.
        URI langmap_directory_uri = URI.create(langmap_directory);

        List<Path> langmap_paths;
        try (Stream<Path> stream_paths = Files.walk(Paths.get(langmap_directory_uri))) {
            langmap_paths = stream_paths
                .filter(Files::isRegularFile)
                .collect(Collectors.toList());
        }
        catch (IOException e) {
            // Without a file listing there is nothing to load; fail with the cause attached.
            throw new java.io.UncheckedIOException(
                "Unable to list language map directory: " + langmap_directory_uri, e);
        }

        // For-each language file
        for (Path langmap_path : langmap_paths) {
            String file_name = langmap_path.getFileName().toString();
            if (file_name.length() < 2) {
                System.err.println("Warning: skipping language map file '" + langmap_path
                                   + "' (name too short to provide a 2-character language key)");
                continue;
            }

            String lang_key = file_name.substring(0,2);
            _all_langmaps.put(lang_key, loadPOSLookup(langmap_path));
        }

        System.out.println("Done Constructing UniversalPOS Language Map");
    }

    /**
     * Reads one language map file into a tag lookup table.
     *
     * <p>Only lines that split into exactly two tab-separated fields are used;
     * every other line is ignored. When a tag occurs more than once, the last
     * occurrence wins.
     *
     * @param langmap_path the file to read
     * @return the entries read from the file; empty if the file could not be opened
     */
    private static HashMap<String,String> loadPOSLookup(Path langmap_path)
    {
        // For-each line within that language file
        try (Stream<String> lang_lines = Files.lines(langmap_path)) {
            return lang_lines
                .map(line -> line.split("\\t"))
                .filter(line_parts -> line_parts.length == 2)
                .collect(Collectors.toMap(
                    line_parts -> line_parts[0],
                    line_parts -> line_parts[1],
                    (earlier, later) -> later,
                    HashMap::new));
        }
        catch (IOException e) {
            // Best effort: one unreadable file must not prevent the others from loading.
            System.err.println("Warning: failed to read language map file '" + langmap_path + "'");
            e.printStackTrace();
            return new HashMap<>();
        }
    }

    /**
     * Returns the number of languages for which a mapping has been loaded.
     *
     * @return the number of language keys held
     */
    public int size()
    {
        return _all_langmaps.size();
    }

    /**
     * Tells whether a mapping has been loaded for the given language key.
     *
     * @param lang_key the language key to test
     * @return {@code true} if a mapping is registered under {@code lang_key}
     */
    public boolean containsLanguage(String lang_key)
    {
        return _all_langmaps.containsKey(lang_key);
    }

    /**
     * Translates a language-specific POS tag into its Universal POS tag.
     *
     * <p>When the language is known but the tag is not, {@code "X"} is
     * returned and the miss is counted in {@link #_missing_pos} under
     * {@code "<lang_key>:<opennlp_pos_key>"}. A warning is printed to
     * {@code System.err} only the first time a given pair is missed.
     *
     * @param lang_key the language key of the mapping to consult
     * @param opennlp_pos_key the language-specific POS tag to translate
     *        (presumably one produced by OpenNLP, going by the parameter name)
     * @return the mapped Universal POS tag; {@code "X"} if the language is
     *         known but the tag is not; {@code null} if no mapping is loaded
     *         for {@code lang_key}
     */
    public String getUniversalLanguagePOS(String lang_key,String opennlp_pos_key)
    {
        HashMap<String,String> langmap = _all_langmaps.get(lang_key);
        if (langmap == null) {
            return null;
        }

        String universal_pos = langmap.get(opennlp_pos_key);
        if (universal_pos != null) {
            return universal_pos;
        }

        // Count the miss under the same "<lang>:<pos>" key used to decide
        // whether the warning has already been shown for this pair.
        String missing_lang_pos = lang_key + ":" + opennlp_pos_key;
        int mpos_freq = _missing_pos.merge(missing_lang_pos, 1, Integer::sum);
        if (mpos_freq == 1) {
            System.err.println("Warning: for language key '"+lang_key
                               +"' failed to find POS '" + opennlp_pos_key + "'");
            System.err.println("Defaulting to POS 'X' (i.e., 'other')");
        }

        return DEFAULT_UNIVERSAL_POS;
    }

}