source: gs3-extensions/atlas-src/trunk/src/org/greenstone/atlas/server/GazetteerTrieType1.java@ 23906

Last change on this file since 23906 was 23906, checked in by sjm84, 13 years ago

Committing most recent version of ATLAS

File size: 8.6 KB
Line 
1package org.greenstone.server;
2
3import java.io.BufferedReader;
4import java.io.FileReader;
5import java.io.Serializable;
6import java.sql.Statement;
7import java.util.ArrayList;
8
9public class GazetteerTrieType1 implements Serializable
10{
11 private static final long serialVersionUID = -959184305931535981L;
12
13 int _nameCount = 0;
14
15 //GazetteerTrieNode _topLevelNode = new GazetteerTrieNode(false);
16 GazetteerTrieFullNode _topLevelNode = new GazetteerTrieFullNode(false);
17
18 Statement _database = null;
19
20 /**
21 * Default constructor
22 */
23 public GazetteerTrieType1()
24 {
25 }
26
27 /**
28 * Contructor that takes a filename to generate the gazetteer
29 *
30 * @param filename
31 * is the name of the file to use to generate the gazetteer with
32 */
33 public GazetteerTrieType1(String filename)
34 {
35 System.out.println("Loading " + filename + " as gazetteer");
36 try
37 {
38 BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename));
39 String line = "";
40
41 ArrayList<String> allPlaceNames = new ArrayList<String>();
42
43 int count = 0;
44 while ((line = gazetteerFile.readLine()) != null)
45 {
46 if(count++ % 10000 == 0){System.out.println(count + " entries loaded");}
47 // The file is tab seperated so split it by tabs
48 String[] columns = line.split("\t");
49
50 ArrayList<String> placeNames = new ArrayList<String>();
51
52 String mainPlaceName = GazetteerHelper.getMainPlaceName(columns);
53 if (mainPlaceName != null)
54 {
55 placeNames.add(mainPlaceName);
56 }
57
58 ArrayList<String> alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns);
59 if (alternatePlaceNames != null)
60 {
61 placeNames.addAll(alternatePlaceNames);
62 }
63
64 allPlaceNames.addAll(placeNames);
65
66 // Add the place names to the trie
67 for (String placeName : placeNames)
68 {
69 this.addPlaceName(placeName);
70 }
71 }
72 }
73 catch (Exception ex)
74 {
75 ex.printStackTrace();
76 }
77 removeAmbiguousPlaceNames();
78 }
79
80 /**
81 * Removes a place name from the trie
82 *
83 * @param placeName
84 * is the place name to be removed
85 * @return true if sucessful and false if the place name did not exist
86 */
87
88 public boolean removePlaceName(String placeName)
89 {
90 if (placeName.length() == 0)
91 {
92 _topLevelNode.setNameEnd(false);
93 return true;
94 }
95 if (placeName.length() == 1)
96 {
97 _topLevelNode.getChild(placeName.charAt(0)).setNameEnd(false);
98 return true;
99 }
100 GazetteerTrieFullNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
101 if(currentNode == null)
102 {
103 System.out.println(placeName.charAt(0) + " = null?");
104 return false;
105 }
106
107 for (int i = 1; i < placeName.length(); i++)
108 {
109 currentNode = currentNode.getChild(placeName.charAt(i));
110 if (currentNode == null)
111 {
112 return false;
113 }
114 }
115
116 currentNode.setNameEnd(false);
117 return true;
118 }
119
120 /**
121 * Adds a place name to the trie
122 *
123 * @param placeName
124 * is the place name to add
125 */
126 public void addPlaceName(String placeName)
127 {
128 if (placeName.length() == 0)
129 {
130 return;
131 }
132
133 if (placeName.length() == 1)
134 {
135 _topLevelNode.addChild(placeName.charAt(0), true);
136 return;
137 }
138
139 _topLevelNode.addChild(placeName.charAt(0), false);
140 GazetteerTrieFullNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
141 for (int i = 1; i < placeName.length() - 1; i++)
142 {
143 currentNode.addChild(placeName.charAt(i), false);
144 currentNode = currentNode.getChild(placeName.charAt(i));
145 }
146
147 currentNode.addChild(placeName.charAt(placeName.length() - 1), true);
148 }
149
150 /**
151 * Checks to see if a place name exists in the trie
152 *
153 * @param placeName
154 * is the place name to check
155 * @return 1 if the place name exists 0 if the place name does not exist but
156 * there might be a match further down the trie -1 if the place name
157 * does not exist and the trie is at a dead end
158 */
159 public int checkPlaceName(String placeName)
160 {
161 if (placeName.length() == 0)
162 {
163 return 0;
164 }
165
166 if (placeName.length() == 1)
167 {
168 GazetteerTrieFullNode node = _topLevelNode.getChild(placeName.charAt(0));
169 if (node == null)
170 {
171 return -1;
172 }
173
174 return node.isNameEnd() ? 1 : 0;
175 }
176
177 GazetteerTrieFullNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
178 if (currentNode == null)
179 {
180 return -1;
181 }
182 for (int i = 1; i < placeName.length(); i++)
183 {
184 currentNode = currentNode.getChild(placeName.charAt(i));
185 if (currentNode == null)
186 {
187 return -1;
188 }
189 }
190
191 if (currentNode.isNameEnd())
192 {
193 return 1;
194 }
195 else
196 {
197 return 0;
198 }
199 }
200
201 /**
202 * Removes place names that are unlikely to be meant as place names in a given text
203 * @param gazetteer is the gazetteer to remove the place names from
204 */
205 public void removeAmbiguousPlaceNames()
206 {
207 removePlaceName("are"); removePlaceName("is");
208 removePlaceName("over"); removePlaceName("at");
209 removePlaceName("of"); removePlaceName("to");
210 removePlaceName("rule"); removePlaceName("time");
211 removePlaceName("real"); removePlaceName("national");
212 removePlaceName("early"); removePlaceName("by");
213 removePlaceName("as"); removePlaceName("eastern");
214 removePlaceName("western"); removePlaceName("southern");
215 removePlaceName("northern"); removePlaceName("east");
216 removePlaceName("west"); removePlaceName("south");
217 removePlaceName("north"); removePlaceName("this");
218 removePlaceName("between"); removePlaceName("many");
219 removePlaceName("strong"); removePlaceName("economy");
220 removePlaceName("mall"); removePlaceName("they");
221 removePlaceName("do"); removePlaceName("image");
222 removePlaceName("republic"); removePlaceName("section");
223 removePlaceName("dollar"); removePlaceName("index");
224 removePlaceName("day"); removePlaceName("council");
225 removePlaceName("use"); removePlaceName("log");
226 removePlaceName("logo"); removePlaceName("best");
227 removePlaceName("go"); removePlaceName("portal");
228 removePlaceName("list"); removePlaceName("english");
229 removePlaceName("page"); removePlaceName("see");
230 removePlaceName("ocean"); removePlaceName("island");
231 removePlaceName("x"); removePlaceName("country");
232 removePlaceName("colony"); removePlaceName("christian");
233 removePlaceName("black"); removePlaceName("independence");
234 removePlaceName("war"); removePlaceName("no");
235 removePlaceName("continental"); removePlaceName("continental");
236 removePlaceName("force"); removePlaceName("reform");
237 removePlaceName("rush"); removePlaceName("read");
238 removePlaceName("none"); removePlaceName("justice");
239 removePlaceName("font"); removePlaceName("u");
240 removePlaceName("y"); removePlaceName("normal");
241 removePlaceName("center"); removePlaceName("date");
242 removePlaceName("story"); removePlaceName("union");
243 removePlaceName("supreme"); removePlaceName("house");
244 removePlaceName("court"); removePlaceName("data");
245 removePlaceName("energy"); removePlaceName("white");
246 removePlaceName("universal"); removePlaceName("protection");
247 removePlaceName("great"); removePlaceName("star");
248 removePlaceName("banner"); removePlaceName("capital");
249 removePlaceName("much"); removePlaceName("sidney");
250 removePlaceName("media"); removePlaceName("protection");
251
252 addPlaceName("United States");
253 }
254}
255
256// TreeMap<Character, Integer> charCount = new TreeMap<Character, Integer>();
257// int upperCount = 0;
258// int lowerCount = 0;
259// int numbers = 0;
260// int arabicCharacters = 0;
261// int chineseCharacters = 0;
262
263// if (!charCount.containsKey(placeName.charAt(placeName.length() - 1)))
264// {
265// charCount.put(placeName.charAt(placeName.length() - 1), 1);
266// }
267// else
268// {
269// charCount.put(placeName.charAt(placeName.length() - 1),
270// charCount.get(placeName.charAt(placeName.length() - 1)) + 1);
271// }
272
273// for (Character c : charCount.keySet())
274// {
275// System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c));
276// }
277//
278// System.out.println("Upper -> " + upperCount);
279// System.out.println("Lower -> " + lowerCount);
280// System.out.println("Number -> " + numbers);
281// System.out.println("Arabic -> " + arabicCharacters);
282// System.out.println("Chinese -> " + chineseCharacters);
283
284// if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z')
285// {
286// upperCount++;
287// }
288//
289// if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z')
290// {
291// lowerCount++;
292// }
293//
294// if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9')
295// {
296// numbers++;
297// }
298//
299// if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711)
300// {
301// arabicCharacters++;
302// }
303//
304// if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016)
305// {
306// chineseCharacters++;
307// }
308//
309// if (!charCount.containsKey(placeName.charAt(i)))
310// {
311// charCount.put(placeName.charAt(i), 1);
312// }
313// else
314// {
315// charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1);
316// }
Note: See TracBrowser for help on using the repository browser.