1 | package org.hathitrust.extractedfeatures;
|
---|
2 |
|
---|
3 | import java.io.File;
|
---|
4 | import java.io.IOException;
|
---|
5 | import java.net.URI;
|
---|
6 | import java.nio.file.Files;
|
---|
7 | import java.nio.file.Path;
|
---|
8 | import java.nio.file.Paths;
|
---|
9 | import java.util.HashMap;
|
---|
10 | import java.util.List;
|
---|
11 | import java.util.stream.Collectors;
|
---|
12 | import java.util.stream.Stream;
|
---|
13 |
|
---|
/**
 * Lookup from language-specific part-of-speech tags to the POS values listed
 * in a directory of per-language map files.
 *
 * Each regular file found under the directory supplies one language. The
 * first two characters of the file name are used as the language key, and
 * every line consisting of exactly two tab-separated fields contributes one
 * {@code tag -> mapped tag} entry.
 *
 * Tags that are requested but absent from a known language's map are counted
 * in {@link #_missing_pos} and reported on stderr the first time they occur.
 *
 * This class performs no synchronization of its own.
 */
public class UniversalPOSLangMap
{
    /** Fallback POS returned when a known language has no entry for a tag. */
    private static final String FALLBACK_POS = "X";

    /** Language key -> (language-specific POS tag -> mapped POS tag). */
    protected HashMap<String,HashMap<String,String>> _all_langmaps;

    /** "lang:pos" -> number of times that unmapped combination was requested. */
    protected HashMap<String,Integer> _missing_pos;

    /**
     * Loads every language map file found (recursively) under the given directory.
     *
     * @param langmap_directory URI string naming the directory; it is handed to
     *        {@code Paths.get(URI)}, so it must use a scheme with an installed
     *        file-system provider (e.g. {@code file:///path/to/dir})
     * @throws IllegalArgumentException if the string is not a valid URI or the
     *         URI cannot be converted to a path
     * @throws java.io.UncheckedIOException if the directory cannot be traversed
     */
    public UniversalPOSLangMap(String langmap_directory) {
        System.out.println("Constructing: UniversalPOS Language Map");

        _all_langmaps = new HashMap<String,HashMap<String,String>>();
        // Must exist before the first lookup: getUniversalLanguagePOS() records misses here.
        _missing_pos = new HashMap<String,Integer>();

        // URI.create reports a malformed string as IllegalArgumentException,
        // rather than leaving a null URI to fail later with an unrelated NPE.
        URI langmap_directory_uri = URI.create(langmap_directory);

        List<Path> langmap_paths;
        try (Stream<Path> stream_paths = Files.walk(Paths.get(langmap_directory_uri))) {
            langmap_paths = stream_paths
                .filter(Files::isRegularFile)
                .collect(Collectors.toList());
        } catch (IOException e) {
            // Without the file list nothing can be loaded; surface the real cause.
            throw new java.io.UncheckedIOException(
                "Failed to list language map files under '" + langmap_directory + "'", e);
        }

        // For-each language file
        for (Path langmap_path : langmap_paths) {
            String file_name = langmap_path.getFileName().toString();
            if (file_name.length() < 2) {
                // Too short to carry a two-character language key.
                System.err.println("Warning: skipping language map file '" + langmap_path
                                   + "': name is too short to contain a language key");
                continue;
            }
            String lang_key = file_name.substring(0,2);

            _all_langmaps.put(lang_key, loadLangMap(langmap_path));
        }

        System.out.println("Done Constructing UniversalPOS Language Map");
    }

    /**
     * Reads one language map file into a lookup table.
     *
     * Lines that do not split into exactly two tab-separated fields are ignored.
     * If the file cannot be opened the error is reported and an empty table is
     * returned, so the language is still registered.
     */
    private static HashMap<String,String> loadLangMap(Path langmap_path) {
        HashMap<String,String> pos_lookup = new HashMap<String,String>();

        // For-each line within that language file
        try (Stream<String> lang_lines = Files.lines(langmap_path)) {
            lang_lines.forEach(line -> {
                String[] line_parts = line.split("\\t");
                if (line_parts.length == 2) {
                    pos_lookup.put(line_parts[0], line_parts[1]);
                }
            });
        } catch (IOException e) {
            System.err.println("Warning: failed to read language map file '" + langmap_path + "'");
            e.printStackTrace();
        }

        return pos_lookup;
    }

    /**
     * @return the number of languages for which a map was loaded
     */
    public int size()
    {
        return _all_langmaps.size();
    }

    /**
     * @param lang_key two-character language key (taken from the map file name)
     * @return {@code true} if a map was loaded for that language
     */
    public boolean containsLanguage(String lang_key)
    {
        return _all_langmaps.containsKey(lang_key);
    }

    /**
     * Translates a language-specific POS tag using that language's map.
     *
     * @param lang_key language key, as accepted by {@link #containsLanguage(String)}
     * @param opennlp_pos_key language-specific POS tag to translate
     *        (presumably as emitted by OpenNLP, going by the name)
     * @return the mapped POS tag; {@code "X"} if the language is known but the
     *         tag is not; {@code null} if no map was loaded for the language
     */
    public String getUniversalLanguagePOS(String lang_key,String opennlp_pos_key)
    {
        HashMap<String,String> langmap = _all_langmaps.get(lang_key);
        if (langmap == null) {
            return null;
        }

        String universal_pos = langmap.get(opennlp_pos_key);
        if (universal_pos != null) {
            return universal_pos;
        }

        // Count the miss under the combined "lang:pos" key, so the warning is
        // printed only the first time this particular combination is seen.
        String missing_lang_pos = lang_key + ":" + opennlp_pos_key;
        int mpos_freq = _missing_pos.merge(missing_lang_pos, 1, Integer::sum);
        if (mpos_freq == 1) {
            System.err.println("Warning: for language key '"+lang_key
                               +"' failed to find POS '" + opennlp_pos_key + "'");
            System.err.println("Defaulting to POS 'X' (i.e., 'other')");
        }

        return FALLBACK_POS;
    }

}
|
---|