source: gs3-extensions/atlas-src/trunk/src/org/greenstone/server/GazetteerTrieType1.java@ 22272

Last change on this file since 22272 was 22272, checked in by sjm84, 14 years ago

Initial version of ATLAS as an extension

File size: 8.8 KB
Line 
1package org.greenstone.server;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.FileReader;
6import java.io.FileWriter;
7import java.io.Serializable;
8import java.sql.Connection;
9import java.sql.DriverManager;
10import java.sql.Statement;
11import java.util.ArrayList;
12
13public class GazetteerTrieType1 implements Serializable
14{
15 private static final long serialVersionUID = -959184305931535981L;
16
17 int _nameCount = 0;
18
19 //GazetteerTrieNode _topLevelNode = new GazetteerTrieNode(false);
20 GazetteerTrieFullNode _topLevelNode = new GazetteerTrieFullNode(false);
21
22 Statement _database = null;
23
24 /**
25 * Default constructor
26 */
27 public GazetteerTrieType1()
28 {
29 }
30
31 /**
32 * Contructor that takes a filename to generate the gazetteer
33 *
34 * @param filename
35 * is the name of the file to use to generate the gazetteer with
36 */
37 public GazetteerTrieType1(String filename)
38 {
39 System.out.println("Loading " + filename + " as gazetteer");
40 try
41 {
42 BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename));
43 String line = "";
44
45 ArrayList<String> allPlaceNames = new ArrayList<String>();
46
47 int count = 0;
48 while ((line = gazetteerFile.readLine()) != null)
49 {
50 if(count++ % 10000 == 0){System.out.println(count + " entries loaded");}
51 // The file is tab seperated so split it by tabs
52 String[] columns = line.split("\t");
53
54 ArrayList<String> placeNames = new ArrayList<String>();
55
56 String mainPlaceName = GazetteerHelper.getMainPlaceName(columns);
57 if (mainPlaceName != null)
58 {
59 placeNames.add(mainPlaceName);
60 }
61
62 ArrayList<String> alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns);
63 if (alternatePlaceNames != null)
64 {
65 placeNames.addAll(alternatePlaceNames);
66 }
67
68 allPlaceNames.addAll(placeNames);
69
70 // Add the place names to the trie
71 for (String placeName : placeNames)
72 {
73 this.addPlaceName(placeName);
74 }
75 }
76 }
77 catch (Exception ex)
78 {
79 ex.printStackTrace();
80 }
81 removeAmbiguousPlaceNames();
82 }
83
84 /**
85 * Removes a place name from the trie
86 *
87 * @param placeName
88 * is the place name to be removed
89 * @return true if sucessful and false if the place name did not exist
90 */
91
92 public boolean removePlaceName(String placeName)
93 {
94 if (placeName.length() == 0)
95 {
96 _topLevelNode.setNameEnd(false);
97 return true;
98 }
99 if (placeName.length() == 1)
100 {
101 _topLevelNode.getChild(placeName.charAt(0)).setNameEnd(false);
102 return true;
103 }
104 GazetteerTrieFullNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
105 if(currentNode == null)
106 {
107 System.out.println(placeName.charAt(0) + " = null?");
108 return false;
109 }
110
111 for (int i = 1; i < placeName.length(); i++)
112 {
113 currentNode = currentNode.getChild(placeName.charAt(i));
114 if (currentNode == null)
115 {
116 return false;
117 }
118 }
119
120 currentNode.setNameEnd(false);
121 return true;
122 }
123
124 /**
125 * Adds a place name to the trie
126 *
127 * @param placeName
128 * is the place name to add
129 */
130 public void addPlaceName(String placeName)
131 {
132 if (placeName.length() == 0)
133 {
134 return;
135 }
136
137 if (placeName.length() == 1)
138 {
139 _topLevelNode.addChild(placeName.charAt(0), true);
140 return;
141 }
142
143 _topLevelNode.addChild(placeName.charAt(0), false);
144 GazetteerTrieFullNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
145 for (int i = 1; i < placeName.length() - 1; i++)
146 {
147 currentNode.addChild(placeName.charAt(i), false);
148 currentNode = currentNode.getChild(placeName.charAt(i));
149 }
150
151 currentNode.addChild(placeName.charAt(placeName.length() - 1), true);
152 }
153
154 /**
155 * Checks to see if a place name exists in the trie
156 *
157 * @param placeName
158 * is the place name to check
159 * @return 1 if the place name exists 0 if the place name does not exist but
160 * there might be a match further down the trie -1 if the place name
161 * does not exist and the trie is at a dead end
162 */
163 public int checkPlaceName(String placeName)
164 {
165 if (placeName.length() == 0)
166 {
167 return 0;
168 }
169
170 if (placeName.length() == 1)
171 {
172 GazetteerTrieFullNode node = _topLevelNode.getChild(placeName.charAt(0));
173 if (node == null)
174 {
175 return -1;
176 }
177
178 return node.isNameEnd() ? 1 : 0;
179 }
180
181 GazetteerTrieFullNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
182 if (currentNode == null)
183 {
184 return -1;
185 }
186 for (int i = 1; i < placeName.length(); i++)
187 {
188 currentNode = currentNode.getChild(placeName.charAt(i));
189 if (currentNode == null)
190 {
191 return -1;
192 }
193 }
194
195 if (currentNode.isNameEnd())
196 {
197 return 1;
198 }
199 else
200 {
201 return 0;
202 }
203 }
204
205 /**
206 * Removes place names that are unlikely to be meant as place names in a given text
207 * @param gazetteer is the gazetteer to remove the place names from
208 */
209 public void removeAmbiguousPlaceNames()
210 {
211 removePlaceName("are"); removePlaceName("is");
212 removePlaceName("over"); removePlaceName("at");
213 removePlaceName("of"); removePlaceName("to");
214 removePlaceName("rule"); removePlaceName("time");
215 removePlaceName("real"); removePlaceName("national");
216 removePlaceName("early"); removePlaceName("by");
217 removePlaceName("as"); removePlaceName("eastern");
218 removePlaceName("western"); removePlaceName("southern");
219 removePlaceName("northern"); removePlaceName("east");
220 removePlaceName("west"); removePlaceName("south");
221 removePlaceName("north"); removePlaceName("this");
222 removePlaceName("between"); removePlaceName("many");
223 removePlaceName("strong"); removePlaceName("economy");
224 removePlaceName("mall"); removePlaceName("they");
225 removePlaceName("do"); removePlaceName("image");
226 removePlaceName("republic"); removePlaceName("section");
227 removePlaceName("dollar"); removePlaceName("index");
228 removePlaceName("day"); removePlaceName("council");
229 removePlaceName("use"); removePlaceName("log");
230 removePlaceName("logo"); removePlaceName("best");
231 removePlaceName("go"); removePlaceName("portal");
232 removePlaceName("list"); removePlaceName("english");
233 removePlaceName("page"); removePlaceName("see");
234 removePlaceName("ocean"); removePlaceName("island");
235 removePlaceName("x"); removePlaceName("country");
236 removePlaceName("colony"); removePlaceName("christian");
237 removePlaceName("black"); removePlaceName("independence");
238 removePlaceName("war"); removePlaceName("no");
239 removePlaceName("continental"); removePlaceName("continental");
240 removePlaceName("force"); removePlaceName("reform");
241 removePlaceName("rush"); removePlaceName("read");
242 removePlaceName("none"); removePlaceName("justice");
243 removePlaceName("font"); removePlaceName("u");
244 removePlaceName("y"); removePlaceName("normal");
245 removePlaceName("center"); removePlaceName("date");
246 removePlaceName("story"); removePlaceName("union");
247 removePlaceName("supreme"); removePlaceName("house");
248 removePlaceName("court"); removePlaceName("data");
249 removePlaceName("energy"); removePlaceName("white");
250 removePlaceName("universal"); removePlaceName("protection");
251 removePlaceName("great"); removePlaceName("star");
252 removePlaceName("banner"); removePlaceName("capital");
253 removePlaceName("much"); removePlaceName("sidney");
254 removePlaceName("media"); removePlaceName("protection");
255
256 addPlaceName("United States");
257 }
258}
259
260// TreeMap<Character, Integer> charCount = new TreeMap<Character, Integer>();
261// int upperCount = 0;
262// int lowerCount = 0;
263// int numbers = 0;
264// int arabicCharacters = 0;
265// int chineseCharacters = 0;
266
267// if (!charCount.containsKey(placeName.charAt(placeName.length() - 1)))
268// {
269// charCount.put(placeName.charAt(placeName.length() - 1), 1);
270// }
271// else
272// {
273// charCount.put(placeName.charAt(placeName.length() - 1),
274// charCount.get(placeName.charAt(placeName.length() - 1)) + 1);
275// }
276
277// for (Character c : charCount.keySet())
278// {
279// System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c));
280// }
281//
282// System.out.println("Upper -> " + upperCount);
283// System.out.println("Lower -> " + lowerCount);
284// System.out.println("Number -> " + numbers);
285// System.out.println("Arabic -> " + arabicCharacters);
286// System.out.println("Chinese -> " + chineseCharacters);
287
288// if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z')
289// {
290// upperCount++;
291// }
292//
293// if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z')
294// {
295// lowerCount++;
296// }
297//
298// if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9')
299// {
300// numbers++;
301// }
302//
303// if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711)
304// {
305// arabicCharacters++;
306// }
307//
308// if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016)
309// {
310// chineseCharacters++;
311// }
312//
313// if (!charCount.containsKey(placeName.charAt(i)))
314// {
315// charCount.put(placeName.charAt(i), 1);
316// }
317// else
318// {
319// charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1);
320// }
Note: See TracBrowser for help on using the repository browser.