package org.hathitrust.extractedfeatures; import java.io.File; import java.io.IOException; import java.net.URI; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.HashMap; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; import scala.Tuple2; public class UniversalPOSLangMap { protected HashMap> _all_langmaps; protected HashMap _missing_pos; public UniversalPOSLangMap(String langmap_directory) { System.out.println("Constructing: UniversalPOS Language Map"); _missing_pos = new HashMap(); _all_langmaps = new HashMap>(); List langmap_paths = null; URI langmap_directory_uri = null; try { langmap_directory_uri = new URI(langmap_directory); } catch (Exception e) { e.printStackTrace(); } Path langmap_directory_path = null; try { // Spark/Hadoop friendly langmap_directory_path = Paths.get(langmap_directory_uri); } catch (Exception e) { // Relative local file-system friendly langmap_directory_path = Paths.get(langmap_directory_uri.getRawPath()); } try (Stream stream_paths = Files.walk(langmap_directory_path)) { langmap_paths = stream_paths .filter(Files::isRegularFile) .collect(Collectors.toList()); } catch (IOException e) { e.printStackTrace(); } // For-each language file langmap_paths.forEach(langmap_path -> { File langmap_file = langmap_path.toFile(); String lang_key = langmap_file.getName().substring(0,2); HashMap pos_lookup = new HashMap(); // For-each line within that language file try (Stream lang_lines = Files.lines(langmap_path)) { lang_lines.forEach(line -> { String[] line_parts = line.split("\\t"); if (line_parts.length == 2) { String pos_key = line_parts[0]; String pos_val = line_parts[1]; pos_lookup.put(pos_key, pos_val); } }); } catch (IOException e) { e.printStackTrace(); } _all_langmaps.put(lang_key, pos_lookup); }); System.out.println("Done Constructing UniversalPOS Language Map"); } public int size() { return _all_langmaps.size(); } public boolean containsLanguage(String lang_key) { return _all_langmaps.containsKey(lang_key); } public String getUniversalLanguagePOSUnchecked(String lang_key,String opennlp_pos_key) { String universal_pos = null; HashMap langmap = _all_langmaps.get(lang_key); if (langmap != null) { universal_pos = langmap.get(opennlp_pos_key); } return universal_pos; } public String getUniversalLanguagePOSChecked(String lang_key,String opennlp_pos_key) { if (!_all_langmaps.containsKey(lang_key)) { // Not a language with a POS map return ""; } String universal_pos = null; HashMap langmap = _all_langmaps.get(lang_key); universal_pos = langmap.get(opennlp_pos_key); if (universal_pos == null) { String missing_lang_pos = lang_key + ":" + opennlp_pos_key; Integer mpos_freq = 0; if (_missing_pos.containsKey(missing_lang_pos)) { mpos_freq = _missing_pos.get(missing_lang_pos); } else { System.err.println("Warning: for language key '"+lang_key +"' failed to find POS '" + opennlp_pos_key + "'"); System.err.println("Defaulting to POS 'X' (i.e., 'other')"); } mpos_freq++; _missing_pos.put(missing_lang_pos,mpos_freq); universal_pos = "X"; } return universal_pos; } public Tuple2 getUniversalLanguagePOSPair(String[] lang_keys,String opennlp_pos_key) { String universal_pos = null; String selected_lang = null; for (int li=0; li Lock onto the first language (highest probability when modeled) selected_lang = lang_keys[0]; if (!_all_langmaps.containsKey(selected_lang)) { // Not a language with a POS map return new Tuple2(selected_lang,null); } // If here, then is a POS language => default to "X" String missing_lang_pos = selected_lang + ":" + opennlp_pos_key; Integer mpos_freq = 0; if (_missing_pos.containsKey(missing_lang_pos)) { mpos_freq = _missing_pos.get(missing_lang_pos); } else { System.err.println("Warning: for language key '"+selected_lang +"' failed to find POS '" + opennlp_pos_key + "'"); System.err.println("Defaulting to POS 'X' (i.e., 'other')"); } mpos_freq++; _missing_pos.put(missing_lang_pos,mpos_freq); universal_pos = "X"; } return new Tuple2(selected_lang,universal_pos); } }