package org.greenstone.server; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.Serializable; import java.sql.Connection; import java.sql.DriverManager; import java.sql.Statement; import java.util.ArrayList; public class GazetteerTrieType2 implements Serializable { private static final long serialVersionUID = -959184305931535981L; int _nameCount = 0; GazetteerTrieNode _topLevelNode = new GazetteerTrieNode(false); //GazetteerTrieTopLevelNode _topLevelNode = new GazetteerTrieTopLevelNode(false); Statement _database = null; /** * Default constructor */ public GazetteerTrieType2() { } /** * Contructor that takes a filename to generate the gazetteer * * @param filename * is the name of the file to use to generate the gazetteer with */ public GazetteerTrieType2(String filename) { System.out.println("Loading " + filename + " as gazetteer"); try { BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename)); String line = ""; ArrayList allPlaceNames = new ArrayList(); int count = 0; while ((line = gazetteerFile.readLine()) != null) { if(count++ % 10000 == 0){System.out.println(count + " entries loaded");} // The file is tab seperated so split it by tabs String[] columns = line.split("\t"); ArrayList placeNames = new ArrayList(); String mainPlaceName = GazetteerHelper.getMainPlaceName(columns); if (mainPlaceName != null) { placeNames.add(mainPlaceName); } ArrayList alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns); if (alternatePlaceNames != null) { placeNames.addAll(alternatePlaceNames); } allPlaceNames.addAll(placeNames); // Add the place names to the trie for (String placeName : placeNames) { this.addPlaceName(placeName); } } } catch (Exception ex) { ex.printStackTrace(); } removeAmbiguousPlaceNames(); } /** * Removes a place name from the trie * * @param placeName * is the place name to be removed * @return true if sucessful and false if the place name did not exist */ public boolean removePlaceName(String placeName) { if (placeName.length() == 0) { _topLevelNode.setNameEnd(false); return true; } if (placeName.length() == 1) { _topLevelNode.getChild(placeName.charAt(0)).setNameEnd(false); return true; } GazetteerTrieNode currentNode = _topLevelNode.getChild(placeName.charAt(0)); if(currentNode == null) { System.out.println(placeName.charAt(0) + " = null?"); return false; } for (int i = 1; i < placeName.length(); i++) { currentNode = currentNode.getChild(placeName.charAt(i)); if (currentNode == null) { return false; } } currentNode.setNameEnd(false); return true; } /** * Adds a place name to the trie * * @param placeName * is the place name to add */ public void addPlaceName(String placeName) { if (placeName.length() == 0) { return; } if (placeName.length() == 1) { _topLevelNode.addChild(placeName.charAt(0), true); return; } _topLevelNode.addChild(placeName.charAt(0), false); GazetteerTrieNode currentNode = _topLevelNode.getChild(placeName.charAt(0)); for (int i = 1; i < placeName.length() - 1; i++) { currentNode.addChild(placeName.charAt(i), false); currentNode = currentNode.getChild(placeName.charAt(i)); } currentNode.addChild(placeName.charAt(placeName.length() - 1), true); } /** * Checks to see if a place name exists in the trie * * @param placeName * is the place name to check * @return 1 if the place name exists 0 if the place name does not exist but * there might be a match further down the trie -1 if the place name * does not exist and the trie is at a dead end */ public int checkPlaceName(String placeName) { System.out.println("CHECKING PLACE NAME"); if (placeName.length() == 0) { return 0; } if (placeName.length() == 1) { GazetteerTrieNode node = _topLevelNode.getChild(placeName.charAt(0)); if (node == null) { return -1; } return node.isNameEnd() ? 1 : 0; } GazetteerTrieNode currentNode = _topLevelNode.getChild(placeName.charAt(0)); if (currentNode == null) { return -1; } for (int i = 1; i < placeName.length(); i++) { currentNode = currentNode.getChild(placeName.charAt(i)); if (currentNode == null) { return -1; } } if (currentNode.isNameEnd()) { return 1; } else { return 0; } } /** * Removes place names that are unlikely to be meant as place names in a given text * @param gazetteer is the gazetteer to remove the place names from */ public void removeAmbiguousPlaceNames() { removePlaceName("are"); removePlaceName("is"); removePlaceName("over"); removePlaceName("at"); removePlaceName("of"); removePlaceName("to"); removePlaceName("rule"); removePlaceName("time"); removePlaceName("real"); removePlaceName("national"); removePlaceName("early"); removePlaceName("by"); removePlaceName("as"); removePlaceName("eastern"); removePlaceName("western"); removePlaceName("southern"); removePlaceName("northern"); removePlaceName("east"); removePlaceName("west"); removePlaceName("south"); removePlaceName("north"); removePlaceName("this"); removePlaceName("between"); removePlaceName("many"); removePlaceName("strong"); removePlaceName("economy"); removePlaceName("mall"); removePlaceName("they"); removePlaceName("do"); removePlaceName("image"); removePlaceName("republic"); removePlaceName("section"); removePlaceName("dollar"); removePlaceName("index"); removePlaceName("day"); removePlaceName("council"); removePlaceName("use"); removePlaceName("log"); removePlaceName("logo"); removePlaceName("best"); removePlaceName("go"); removePlaceName("portal"); removePlaceName("list"); removePlaceName("english"); removePlaceName("page"); removePlaceName("see"); removePlaceName("ocean"); removePlaceName("island"); removePlaceName("x"); removePlaceName("country"); removePlaceName("colony"); removePlaceName("christian"); removePlaceName("black"); removePlaceName("independence"); removePlaceName("war"); removePlaceName("no"); removePlaceName("continental"); removePlaceName("continental"); removePlaceName("force"); removePlaceName("reform"); removePlaceName("rush"); removePlaceName("read"); removePlaceName("none"); removePlaceName("justice"); removePlaceName("font"); removePlaceName("u"); removePlaceName("y"); removePlaceName("normal"); removePlaceName("center"); removePlaceName("date"); removePlaceName("story"); removePlaceName("union"); removePlaceName("supreme"); removePlaceName("house"); removePlaceName("court"); removePlaceName("data"); removePlaceName("energy"); removePlaceName("white"); removePlaceName("universal"); removePlaceName("protection"); removePlaceName("great"); removePlaceName("star"); removePlaceName("banner"); removePlaceName("capital"); removePlaceName("much"); removePlaceName("sidney"); removePlaceName("media"); removePlaceName("protection"); addPlaceName("United States"); } } // TreeMap charCount = new TreeMap(); // int upperCount = 0; // int lowerCount = 0; // int numbers = 0; // int arabicCharacters = 0; // int chineseCharacters = 0; // if (!charCount.containsKey(placeName.charAt(placeName.length() - 1))) // { // charCount.put(placeName.charAt(placeName.length() - 1), 1); // } // else // { // charCount.put(placeName.charAt(placeName.length() - 1), // charCount.get(placeName.charAt(placeName.length() - 1)) + 1); // } // for (Character c : charCount.keySet()) // { // System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c)); // } // // System.out.println("Upper -> " + upperCount); // System.out.println("Lower -> " + lowerCount); // System.out.println("Number -> " + numbers); // System.out.println("Arabic -> " + arabicCharacters); // System.out.println("Chinese -> " + chineseCharacters); // if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z') // { // upperCount++; // } // // if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z') // { // lowerCount++; // } // // if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9') // { // numbers++; // } // // if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711) // { // arabicCharacters++; // } // // if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016) // { // chineseCharacters++; // } // // if (!charCount.containsKey(placeName.charAt(i))) // { // charCount.put(placeName.charAt(i), 1); // } // else // { // charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1); // }