package org.greenstone.server; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.Serializable; import java.sql.Connection; import java.sql.DriverManager; import java.sql.Statement; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import com.sun.org.apache.bcel.internal.generic.NEW; public class GazetteerTrieType5 implements Serializable { private static final long serialVersionUID = -959184305931535981L; int _nameCount = 0; HashMap _gazetteer = new HashMap(); /** * Default constructor */ public GazetteerTrieType5() { } /** * Contructor that takes a filename to generate the gazetteer * * @param filename * is the name of the file to use to generate the gazetteer with */ public GazetteerTrieType5(String filename) { System.out.println("Loading " + filename + " as gazetteer"); try { BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename)); String line = ""; ArrayList allPlaceNames = new ArrayList(); int count = 0; while ((line = gazetteerFile.readLine()) != null) { if (count++ % 10000 == 0) { System.out.println(count + " entries loaded"); } // The file is tab seperated so split it by tabs String[] columns = line.split("\t"); ArrayList placeNames = new ArrayList(); String mainPlaceName = GazetteerHelper.getMainPlaceName(columns); if (mainPlaceName != null) { placeNames.add(mainPlaceName); } ArrayList alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns); if (alternatePlaceNames != null) { placeNames.addAll(alternatePlaceNames); } allPlaceNames.addAll(placeNames); // Add the place names to the trie for (String placeName : placeNames) { this.addPlaceName(placeName); } } } catch (Exception ex) { ex.printStackTrace(); } removeAmbiguousPlaceNames(); } /** * Removes a place name from the trie * * @param placeName * is the place name to be removed * @return true if sucessful and false if the place name did not exist */ public void removePlaceName(String placeName) { _gazetteer.remove(placeName); } /** * Adds a place name to the trie * * @param placeName * is the place name to add */ public void addPlaceName(String placeName) { String[] words = placeName.split(" "); for (int i = 0; i < (words.length - 1); i++) { _gazetteer.put(words[i], 0); } _gazetteer.put(words[words.length-1], 1); } /** * Checks to see if a place name exists in the trie * * @param placeName * is the place name to check * @return 1 if the place name exists 0 if the place name does not exist but * there might be a match further down the trie -1 if the place name * does not exist and the trie is at a dead end */ public int checkPlaceName(String placeName) { String[] words = placeName.split(" "); for(int i = 0; i < (words.length - 1); i++) { if(_gazetteer.get(words[i]) == null) { return -1; } } if (_gazetteer.get(words[words.length-1]) == null) { return -1; } else { return _gazetteer.get(words[words.length-1]) ; } } /** * Removes place names that are unlikely to be meant as place names in a * given text * * @param gazetteer * is the gazetteer to remove the place names from */ public void removeAmbiguousPlaceNames() { removePlaceName("are"); removePlaceName("is"); removePlaceName("over"); removePlaceName("at"); removePlaceName("of"); removePlaceName("to"); removePlaceName("rule"); removePlaceName("time"); removePlaceName("real"); removePlaceName("national"); removePlaceName("early"); removePlaceName("by"); removePlaceName("as"); removePlaceName("eastern"); removePlaceName("western"); removePlaceName("southern"); removePlaceName("northern"); removePlaceName("east"); removePlaceName("west"); removePlaceName("south"); removePlaceName("north"); removePlaceName("this"); removePlaceName("between"); removePlaceName("many"); removePlaceName("strong"); removePlaceName("economy"); removePlaceName("mall"); removePlaceName("they"); removePlaceName("do"); removePlaceName("image"); removePlaceName("republic"); removePlaceName("section"); removePlaceName("dollar"); removePlaceName("index"); removePlaceName("day"); removePlaceName("council"); removePlaceName("use"); removePlaceName("log"); removePlaceName("logo"); removePlaceName("best"); removePlaceName("go"); removePlaceName("portal"); removePlaceName("list"); removePlaceName("english"); removePlaceName("page"); removePlaceName("see"); removePlaceName("ocean"); removePlaceName("island"); removePlaceName("x"); removePlaceName("country"); removePlaceName("colony"); removePlaceName("christian"); removePlaceName("black"); removePlaceName("independence"); removePlaceName("war"); removePlaceName("no"); removePlaceName("continental"); removePlaceName("force"); removePlaceName("reform"); removePlaceName("rush"); removePlaceName("read"); removePlaceName("none"); removePlaceName("justice"); removePlaceName("font"); removePlaceName("u"); removePlaceName("y"); removePlaceName("normal"); removePlaceName("center"); removePlaceName("date"); removePlaceName("story"); removePlaceName("union"); removePlaceName("supreme"); removePlaceName("house"); removePlaceName("court"); removePlaceName("data"); removePlaceName("energy"); removePlaceName("white"); removePlaceName("universal"); removePlaceName("protection"); removePlaceName("great"); removePlaceName("star"); removePlaceName("banner"); removePlaceName("capital"); removePlaceName("much"); removePlaceName("sidney"); removePlaceName("media"); removePlaceName("protection"); addPlaceName("United States"); } } // TreeMap charCount = new TreeMap(); // int upperCount = 0; // int lowerCount = 0; // int numbers = 0; // int arabicCharacters = 0; // int chineseCharacters = 0; // if (!charCount.containsKey(placeName.charAt(placeName.length() - 1))) // { // charCount.put(placeName.charAt(placeName.length() - 1), 1); // } // else // { // charCount.put(placeName.charAt(placeName.length() - 1), // charCount.get(placeName.charAt(placeName.length() - 1)) + 1); // } // for (Character c : charCount.keySet()) // { // System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c)); // } // // System.out.println("Upper -> " + upperCount); // System.out.println("Lower -> " + lowerCount); // System.out.println("Number -> " + numbers); // System.out.println("Arabic -> " + arabicCharacters); // System.out.println("Chinese -> " + chineseCharacters); // if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z') // { // upperCount++; // } // // if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z') // { // lowerCount++; // } // // if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9') // { // numbers++; // } // // if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711) // { // arabicCharacters++; // } // // if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016) // { // chineseCharacters++; // } // // if (!charCount.containsKey(placeName.charAt(i))) // { // charCount.put(placeName.charAt(i), 1); // } // else // { // charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1); // }