source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/UniversalPOSLangMap.java@ 31506

Last change on this file since 31506 was 31506, checked in by davidb, 7 years ago

Forgot to add initialization line. Doh!

File size: 3.0 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.File;
4import java.io.IOException;
5import java.net.URI;
6import java.nio.file.Files;
7import java.nio.file.Path;
8import java.nio.file.Paths;
9import java.util.HashMap;
10import java.util.List;
11import java.util.stream.Collectors;
12import java.util.stream.Stream;
13
/**
 * Maps language-specific part-of-speech (POS) tags onto a shared "universal"
 * POS tag set.
 *
 * <p>The mapping tables are read from a directory of language map files.  The
 * first two characters of each file name are used as the language key, and
 * every line of the form {@code <pos-key>TAB<universal-pos>} contributes one
 * entry to that language's lookup table.  Lines that do not split into exactly
 * two tab-separated fields are ignored.
 *
 * <p>Instances keep a running tally of POS tags that could not be mapped (see
 * {@link #getUniversalLanguagePOS(String, String)}).  No synchronization is
 * performed by this class.
 */
public class UniversalPOSLangMap
{

    /** Language key -> (language-specific POS tag -> universal POS tag). */
    protected HashMap<String,HashMap<String,String>> _all_langmaps;

    /** "lang:pos" -> number of times that POS tag was requested but had no mapping. */
    protected HashMap<String,Integer> _missing_pos;

    /**
     * Loads every language map file found under the given directory.
     *
     * <p>Problems locating or reading the files are reported on standard error
     * and loading continues with whatever could be read, so the resulting map
     * may be empty or partial; callers can check {@link #size()}.
     *
     * @param langmap_directory the directory to scan (recursively), expressed as
     *        a URI string (for example {@code file:///path/to/langmaps/})
     */
    public UniversalPOSLangMap(String langmap_directory) {
        System.out.println("Constructing: UniversalPOS Language Map");

        _missing_pos = new HashMap<String,Integer>();
        _all_langmaps = new HashMap<String,HashMap<String,String>>();

        // A null result means the directory could not be listed; the failure has
        // already been reported, so carry on with an empty set of language maps.
        List<Path> langmap_paths = listLangmapFiles(langmap_directory);
        if (langmap_paths != null) {
            for (Path langmap_path : langmap_paths) {
                loadLangmapFile(langmap_path);
            }
        }

        System.out.println("Done Constructing UniversalPOS Language Map");
    }

    /**
     * Collects all regular files found (recursively) under the given directory.
     *
     * @param langmap_directory directory to scan, as a URI string
     * @return the regular files found, or {@code null} if the URI string could
     *         not be parsed or the directory could not be walked
     */
    private static List<Path> listLangmapFiles(String langmap_directory) {
        URI langmap_directory_uri;
        try {
            langmap_directory_uri = new URI(langmap_directory);
        }
        catch (java.net.URISyntaxException e) {
            e.printStackTrace();
            return null;
        }

        try (Stream<Path> stream_paths = Files.walk(Paths.get(langmap_directory_uri))) {
            return stream_paths
                .filter(Files::isRegularFile)
                .collect(Collectors.toList());
        }
        catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Reads one language map file and registers its lookup table under the
     * language key taken from the first two characters of the file name.
     *
     * @param langmap_path the language map file to read
     */
    private void loadLangmapFile(Path langmap_path) {
        String file_name = langmap_path.toFile().getName();
        if (file_name.length() < 2) {
            // Without two characters there is no language key to file this under;
            // skip the file rather than abort the whole load.
            System.err.println("Warning: skipping language map file '" + langmap_path
                               + "' (file name too short to provide a 2-character language key)");
            return;
        }
        String lang_key = file_name.substring(0,2);

        HashMap<String,String> pos_lookup = new HashMap<String,String>();

        // For-each line within that language file
        try (Stream<String> lang_lines = Files.lines(langmap_path)) {
            lang_lines.forEach(line -> {
                String[] line_parts = line.split("\\t");
                if (line_parts.length == 2) {
                    pos_lookup.put(line_parts[0], line_parts[1]);
                }
            });
        }
        catch (IOException e) {
            e.printStackTrace();
        }

        // Registered even if reading failed part-way, keeping whatever was parsed.
        _all_langmaps.put(lang_key, pos_lookup);
    }

    /**
     * @return the number of languages for which a lookup table is held
     */
    public int size()
    {
        return _all_langmaps.size();
    }

    /**
     * @param lang_key language key (first two characters of a language map file name)
     * @return {@code true} if a lookup table is held for {@code lang_key}
     */
    public boolean containsLanguage(String lang_key)
    {
        return _all_langmaps.containsKey(lang_key);
    }

    /**
     * Translates a language-specific POS tag into its universal POS tag.
     *
     * <p>If the language is known but the tag has no mapping, the result
     * defaults to {@code "X"} and the miss is counted in {@code _missing_pos}
     * under the key {@code lang_key + ":" + opennlp_pos_key}.  A warning is
     * printed to standard error only the first time a given language/tag pair
     * is missed.
     *
     * @param lang_key language key to look the tag up under
     * @param opennlp_pos_key the language-specific POS tag to translate
     * @return the universal POS tag; {@code "X"} if the language is known but
     *         the tag is not; {@code null} if the language itself is unknown
     */
    public String getUniversalLanguagePOS(String lang_key,String opennlp_pos_key)
    {
        HashMap<String,String> langmap = _all_langmaps.get(lang_key);
        if (langmap == null) {
            return null;
        }

        String universal_pos = langmap.get(opennlp_pos_key);
        if (universal_pos != null) {
            return universal_pos;
        }

        String missing_lang_pos = lang_key + ":" + opennlp_pos_key;
        if (!_missing_pos.containsKey(missing_lang_pos)) {
            System.err.println("Warning: for language key '"+lang_key
                               +"' failed to find POS '" + opennlp_pos_key + "'");
            System.err.println("Defaulting to POS 'X' (i.e., 'other')");
        }
        // The tally is keyed on the same "lang:pos" string that is checked above,
        // so repeat misses are counted and the warning is not printed again.
        _missing_pos.merge(missing_lang_pos, 1, Integer::sum);

        return "X";
    }

}
Note: See TracBrowser for help on using the repository browser.