1 | package org.hathitrust.extractedfeatures;
|
---|
2 |
|
---|
3 | import java.io.File;
|
---|
4 | import java.io.IOException;
|
---|
5 | import java.net.URI;
|
---|
6 | import java.nio.file.Files;
|
---|
7 | import java.nio.file.Path;
|
---|
8 | import java.nio.file.Paths;
|
---|
9 | import java.util.HashMap;
|
---|
10 | import java.util.List;
|
---|
11 | import java.util.stream.Collectors;
|
---|
12 | import java.util.stream.Stream;
|
---|
13 |
|
---|
/**
 * In-memory lookup built from a directory of tab-separated language map files.
 *
 * <p>Every regular file found under the directory (searched recursively) is
 * read as one language map. The first two characters of the file name are used
 * as the language key, and each line consisting of exactly two tab-separated
 * fields contributes one {@code key -> value} entry. Judging by the class and
 * method names, the keys are language-specific part-of-speech tags and the
 * values are the corresponding universal POS tags.
 *
 * <p>If several files share the same two-character prefix, the one visited
 * last replaces the earlier ones.
 */
public class UniversalPOSLangMap
{

    /** Language key (first two characters of the file name) to that file's tag lookup. */
    protected HashMap<String,HashMap<String,String>> _all_langmaps;

    /**
     * Loads every language map file found under {@code langmap_directory}.
     *
     * <p>Loading is best-effort: problems are reported on standard error and
     * the affected directory or file simply contributes no entries, so the
     * constructor itself does not fail because of an unreadable location.
     *
     * @param langmap_directory location of the language map files, either as a
     *        URI with a scheme (for example {@code file:///data/langmaps}) or
     *        as a plain filesystem path
     */
    public UniversalPOSLangMap(String langmap_directory) {
        System.out.println("Constructing: UniversalPOS Language Map");

        _all_langmaps = new HashMap<String,HashMap<String,String>>();

        Path langmap_root = resolveDirectory(langmap_directory);
        if (langmap_root == null) {
            // Already reported; leave the map empty rather than crash later on.
            return;
        }

        List<Path> langmap_paths = null;
        try (Stream<Path> stream_paths = Files.walk(langmap_root)) {
            langmap_paths = stream_paths
                .filter(Files::isRegularFile)
                .collect(Collectors.toList());
        } catch (IOException | java.io.UncheckedIOException e) {
            // Files.walk reports failures met during traversal as UncheckedIOException.
            e.printStackTrace();
        }
        if (langmap_paths == null) {
            return;
        }

        // For-each language file
        for (Path langmap_path : langmap_paths) {
            String file_name = langmap_path.getFileName().toString();
            if (file_name.length() < 2) {
                // Cannot derive a two-character language key from this name.
                System.err.println("Skipping language map file with a name shorter than 2 characters: "
                                   + langmap_path);
                continue;
            }
            String lang_key = file_name.substring(0, 2);

            _all_langmaps.put(lang_key, readLangMap(langmap_path));
        }
    }

    /**
     * Turns the constructor argument into a {@link Path}.
     *
     * <p>{@code Paths.get(URI)} only accepts URIs that carry a scheme, so a
     * value without a scheme, or one that cannot be used as a URI at all, is
     * interpreted as a plain filesystem path instead.
     *
     * @return the resolved path, or {@code null} (after reporting the problem)
     *         when the value cannot be interpreted either way
     */
    private static Path resolveDirectory(String langmap_directory) {
        if (langmap_directory == null) {
            System.err.println("No language map directory given");
            return null;
        }

        try {
            URI langmap_directory_uri = new URI(langmap_directory);
            if (langmap_directory_uri.getScheme() != null) {
                return Paths.get(langmap_directory_uri);
            }
        }
        catch (Exception ignored) {
            // Not usable as a URI (bad syntax, unknown scheme, ...): try it as a plain path below.
        }

        try {
            return Paths.get(langmap_directory);
        }
        catch (IllegalArgumentException e) {
            // InvalidPathException is a subclass of IllegalArgumentException.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Reads one language map file.
     *
     * <p>Only lines with exactly two tab-separated fields are used; all other
     * lines are ignored. If the file cannot be read, the error is reported and
     * an empty lookup is returned, so the language is still registered.
     */
    private static HashMap<String,String> readLangMap(Path langmap_path) {
        HashMap<String,String> pos_lookup = new HashMap<String,String>();

        try {
            // readAllLines reports decoding problems as IOException, unlike
            // Files.lines(), whose stream throws UncheckedIOException mid-iteration.
            for (String line : Files.readAllLines(langmap_path)) {
                String[] line_parts = line.split("\\t");
                if (line_parts.length == 2) {
                    String pos_key = line_parts[0];
                    String pos_val = line_parts[1];
                    pos_lookup.put(pos_key, pos_val);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return pos_lookup;
    }

    /**
     * Tells whether a language map was loaded for the given key.
     *
     * @param lang_key two-character language key, as taken from the file names
     * @return {@code true} if a map is registered under {@code lang_key}
     */
    public boolean containsLanguage(String lang_key)
    {
        return _all_langmaps.containsKey(lang_key);
    }

    /**
     * Looks up the value mapped to a tag in one language's map.
     *
     * @param lang_key two-character language key, as taken from the file names
     * @param opennlp_pos_key tag to look up (first column of the map file)
     * @return the mapped value (second column of the map file), or {@code null}
     *         if the language or the tag is unknown
     */
    public String getUniversalLanguagePOS(String lang_key,String opennlp_pos_key)
    {
        String universal_pos = null;

        HashMap<String,String> langmap = _all_langmaps.get(lang_key);
        if (langmap != null) {
            universal_pos = langmap.get(opennlp_pos_key);
        }

        return universal_pos;
    }

}