1 | package org.greenstone.server;
|
---|
2 |
|
---|
3 | import java.io.BufferedReader;
|
---|
4 | import java.io.BufferedWriter;
|
---|
5 | import java.io.FileReader;
|
---|
6 | import java.io.FileWriter;
|
---|
7 | import java.io.Serializable;
|
---|
8 | import java.sql.Connection;
|
---|
9 | import java.sql.DriverManager;
|
---|
10 | import java.sql.Statement;
|
---|
11 | import java.util.ArrayList;
|
---|
12 |
|
---|
13 | public class GazetteerTrieType3 implements Serializable
|
---|
14 | {
|
---|
15 | private static final long serialVersionUID = -959184305931535981L;
|
---|
16 |
|
---|
17 | int _nameCount = 0;
|
---|
18 |
|
---|
19 | //GazetteerTrieNode _topLevelNode = new GazetteerTrieNode(false);
|
---|
20 | GazetteerTrieTopLevelNode _topLevelNode = new GazetteerTrieTopLevelNode(false);
|
---|
21 |
|
---|
22 | Statement _database = null;
|
---|
23 |
|
---|
24 | /**
|
---|
25 | * Default constructor
|
---|
26 | */
|
---|
27 | public GazetteerTrieType3()
|
---|
28 | {
|
---|
29 | }
|
---|
30 |
|
---|
31 | /**
|
---|
32 | * Contructor that takes a filename to generate the gazetteer
|
---|
33 | *
|
---|
34 | * @param filename
|
---|
35 | * is the name of the file to use to generate the gazetteer with
|
---|
36 | */
|
---|
37 | public GazetteerTrieType3(String filename)
|
---|
38 | {
|
---|
39 | System.out.println("Loading " + filename + " as gazetteer");
|
---|
40 | try
|
---|
41 | {
|
---|
42 | BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename));
|
---|
43 | String line = "";
|
---|
44 |
|
---|
45 | ArrayList<String> allPlaceNames = new ArrayList<String>();
|
---|
46 |
|
---|
47 | int count = 0;
|
---|
48 | while ((line = gazetteerFile.readLine()) != null)
|
---|
49 | {
|
---|
50 | if(count++ % 10000 == 0){System.out.println(count + " entries loaded");}
|
---|
51 | // The file is tab seperated so split it by tabs
|
---|
52 | String[] columns = line.split("\t");
|
---|
53 |
|
---|
54 | ArrayList<String> placeNames = new ArrayList<String>();
|
---|
55 |
|
---|
56 | String mainPlaceName = GazetteerHelper.getMainPlaceName(columns);
|
---|
57 | if (mainPlaceName != null)
|
---|
58 | {
|
---|
59 | placeNames.add(mainPlaceName);
|
---|
60 | }
|
---|
61 |
|
---|
62 | ArrayList<String> alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns);
|
---|
63 | if (alternatePlaceNames != null)
|
---|
64 | {
|
---|
65 | placeNames.addAll(alternatePlaceNames);
|
---|
66 | }
|
---|
67 |
|
---|
68 | allPlaceNames.addAll(placeNames);
|
---|
69 |
|
---|
70 | // Add the place names to the trie
|
---|
71 | for (String placeName : placeNames)
|
---|
72 | {
|
---|
73 | this.addPlaceName(placeName);
|
---|
74 | }
|
---|
75 | }
|
---|
76 | }
|
---|
77 | catch (Exception ex)
|
---|
78 | {
|
---|
79 | ex.printStackTrace();
|
---|
80 | }
|
---|
81 | removeAmbiguousPlaceNames();
|
---|
82 | }
|
---|
83 |
|
---|
84 | /**
|
---|
85 | * Removes a place name from the trie
|
---|
86 | *
|
---|
87 | * @param placeName
|
---|
88 | * is the place name to be removed
|
---|
89 | * @return true if sucessful and false if the place name did not exist
|
---|
90 | */
|
---|
91 |
|
---|
92 | public boolean removePlaceName(String placeName)
|
---|
93 | {
|
---|
94 | if (placeName.length() == 0)
|
---|
95 | {
|
---|
96 | _topLevelNode.setNameEnd(false);
|
---|
97 | return true;
|
---|
98 | }
|
---|
99 | if (placeName.length() == 1)
|
---|
100 | {
|
---|
101 | _topLevelNode.getChild(placeName.charAt(0)).setNameEnd(false);
|
---|
102 | return true;
|
---|
103 | }
|
---|
104 | GazetteerTrieNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
|
---|
105 | if(currentNode == null)
|
---|
106 | {
|
---|
107 | System.out.println(placeName.charAt(0) + " = null?");
|
---|
108 | return false;
|
---|
109 | }
|
---|
110 |
|
---|
111 | for (int i = 1; i < placeName.length(); i++)
|
---|
112 | {
|
---|
113 | currentNode = currentNode.getChild(placeName.charAt(i));
|
---|
114 | if (currentNode == null)
|
---|
115 | {
|
---|
116 | return false;
|
---|
117 | }
|
---|
118 | }
|
---|
119 |
|
---|
120 | currentNode.setNameEnd(false);
|
---|
121 | return true;
|
---|
122 | }
|
---|
123 |
|
---|
124 | /**
|
---|
125 | * Adds a place name to the trie
|
---|
126 | *
|
---|
127 | * @param placeName
|
---|
128 | * is the place name to add
|
---|
129 | */
|
---|
130 | public void addPlaceName(String placeName)
|
---|
131 | {
|
---|
132 | if (placeName.length() == 0)
|
---|
133 | {
|
---|
134 | return;
|
---|
135 | }
|
---|
136 |
|
---|
137 | if (placeName.length() == 1)
|
---|
138 | {
|
---|
139 | _topLevelNode.addChild(placeName.charAt(0), true);
|
---|
140 | return;
|
---|
141 | }
|
---|
142 |
|
---|
143 | _topLevelNode.addChild(placeName.charAt(0), false);
|
---|
144 | GazetteerTrieNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
|
---|
145 | for (int i = 1; i < placeName.length() - 1; i++)
|
---|
146 | {
|
---|
147 | currentNode.addChild(placeName.charAt(i), false);
|
---|
148 | currentNode = currentNode.getChild(placeName.charAt(i));
|
---|
149 | }
|
---|
150 |
|
---|
151 | currentNode.addChild(placeName.charAt(placeName.length() - 1), true);
|
---|
152 | }
|
---|
153 |
|
---|
154 | /**
|
---|
155 | * Checks to see if a place name exists in the trie
|
---|
156 | *
|
---|
157 | * @param placeName
|
---|
158 | * is the place name to check
|
---|
159 | * @return 1 if the place name exists 0 if the place name does not exist but
|
---|
160 | * there might be a match further down the trie -1 if the place name
|
---|
161 | * does not exist and the trie is at a dead end
|
---|
162 | */
|
---|
163 | public int checkPlaceName(String placeName)
|
---|
164 | {
|
---|
165 | if (placeName.length() == 0)
|
---|
166 | {
|
---|
167 | return 0;
|
---|
168 | }
|
---|
169 |
|
---|
170 | if (placeName.length() == 1)
|
---|
171 | {
|
---|
172 | GazetteerTrieNode node = _topLevelNode.getChild(placeName.charAt(0));
|
---|
173 | if (node == null)
|
---|
174 | {
|
---|
175 | return -1;
|
---|
176 | }
|
---|
177 |
|
---|
178 | return node.isNameEnd() ? 1 : 0;
|
---|
179 | }
|
---|
180 |
|
---|
181 | GazetteerTrieNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
|
---|
182 | if (currentNode == null)
|
---|
183 | {
|
---|
184 | return -1;
|
---|
185 | }
|
---|
186 | for (int i = 1; i < placeName.length(); i++)
|
---|
187 | {
|
---|
188 | currentNode = currentNode.getChild(placeName.charAt(i));
|
---|
189 | if (currentNode == null)
|
---|
190 | {
|
---|
191 | return -1;
|
---|
192 | }
|
---|
193 | }
|
---|
194 |
|
---|
195 | if (currentNode.isNameEnd())
|
---|
196 | {
|
---|
197 | return 1;
|
---|
198 | }
|
---|
199 | else
|
---|
200 | {
|
---|
201 | return 0;
|
---|
202 | }
|
---|
203 | }
|
---|
204 |
|
---|
205 | /**
|
---|
206 | * Removes place names that are unlikely to be meant as place names in a given text
|
---|
207 | * @param gazetteer is the gazetteer to remove the place names from
|
---|
208 | */
|
---|
209 | public void removeAmbiguousPlaceNames()
|
---|
210 | {
|
---|
211 | removePlaceName("are"); removePlaceName("is");
|
---|
212 | removePlaceName("over"); removePlaceName("at");
|
---|
213 | removePlaceName("of"); removePlaceName("to");
|
---|
214 | removePlaceName("rule"); removePlaceName("time");
|
---|
215 | removePlaceName("real"); removePlaceName("national");
|
---|
216 | removePlaceName("early"); removePlaceName("by");
|
---|
217 | removePlaceName("as"); removePlaceName("eastern");
|
---|
218 | removePlaceName("western"); removePlaceName("southern");
|
---|
219 | removePlaceName("northern"); removePlaceName("east");
|
---|
220 | removePlaceName("west"); removePlaceName("south");
|
---|
221 | removePlaceName("north"); removePlaceName("this");
|
---|
222 | removePlaceName("between"); removePlaceName("many");
|
---|
223 | removePlaceName("strong"); removePlaceName("economy");
|
---|
224 | removePlaceName("mall"); removePlaceName("they");
|
---|
225 | removePlaceName("do"); removePlaceName("image");
|
---|
226 | removePlaceName("republic"); removePlaceName("section");
|
---|
227 | removePlaceName("dollar"); removePlaceName("index");
|
---|
228 | removePlaceName("day"); removePlaceName("council");
|
---|
229 | removePlaceName("use"); removePlaceName("log");
|
---|
230 | removePlaceName("logo"); removePlaceName("best");
|
---|
231 | removePlaceName("go"); removePlaceName("portal");
|
---|
232 | removePlaceName("list"); removePlaceName("english");
|
---|
233 | removePlaceName("page"); removePlaceName("see");
|
---|
234 | removePlaceName("ocean"); removePlaceName("island");
|
---|
235 | removePlaceName("x"); removePlaceName("country");
|
---|
236 | removePlaceName("colony"); removePlaceName("christian");
|
---|
237 | removePlaceName("black"); removePlaceName("independence");
|
---|
238 | removePlaceName("war"); removePlaceName("no");
|
---|
239 | removePlaceName("continental"); removePlaceName("continental");
|
---|
240 | removePlaceName("force"); removePlaceName("reform");
|
---|
241 | removePlaceName("rush"); removePlaceName("read");
|
---|
242 | removePlaceName("none"); removePlaceName("justice");
|
---|
243 | removePlaceName("font"); removePlaceName("u");
|
---|
244 | removePlaceName("y"); removePlaceName("normal");
|
---|
245 | removePlaceName("center"); removePlaceName("date");
|
---|
246 | removePlaceName("story"); removePlaceName("union");
|
---|
247 | removePlaceName("supreme"); removePlaceName("house");
|
---|
248 | removePlaceName("court"); removePlaceName("data");
|
---|
249 | removePlaceName("energy"); removePlaceName("white");
|
---|
250 | removePlaceName("universal"); removePlaceName("protection");
|
---|
251 | removePlaceName("great"); removePlaceName("star");
|
---|
252 | removePlaceName("banner"); removePlaceName("capital");
|
---|
253 | removePlaceName("much"); removePlaceName("sidney");
|
---|
254 | removePlaceName("media"); removePlaceName("protection");
|
---|
255 |
|
---|
256 | addPlaceName("United States");
|
---|
257 | }
|
---|
258 | }
|
---|
259 |
|
---|
260 | // TreeMap<Character, Integer> charCount = new TreeMap<Character, Integer>();
|
---|
261 | // int upperCount = 0;
|
---|
262 | // int lowerCount = 0;
|
---|
263 | // int numbers = 0;
|
---|
264 | // int arabicCharacters = 0;
|
---|
265 | // int chineseCharacters = 0;
|
---|
266 |
|
---|
267 | // if (!charCount.containsKey(placeName.charAt(placeName.length() - 1)))
|
---|
268 | // {
|
---|
269 | // charCount.put(placeName.charAt(placeName.length() - 1), 1);
|
---|
270 | // }
|
---|
271 | // else
|
---|
272 | // {
|
---|
273 | // charCount.put(placeName.charAt(placeName.length() - 1),
|
---|
274 | // charCount.get(placeName.charAt(placeName.length() - 1)) + 1);
|
---|
275 | // }
|
---|
276 |
|
---|
277 | // for (Character c : charCount.keySet())
|
---|
278 | // {
|
---|
279 | // System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c));
|
---|
280 | // }
|
---|
281 | //
|
---|
282 | // System.out.println("Upper -> " + upperCount);
|
---|
283 | // System.out.println("Lower -> " + lowerCount);
|
---|
284 | // System.out.println("Number -> " + numbers);
|
---|
285 | // System.out.println("Arabic -> " + arabicCharacters);
|
---|
286 | // System.out.println("Chinese -> " + chineseCharacters);
|
---|
287 |
|
---|
288 | // if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z')
|
---|
289 | // {
|
---|
290 | // upperCount++;
|
---|
291 | // }
|
---|
292 | //
|
---|
293 | // if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z')
|
---|
294 | // {
|
---|
295 | // lowerCount++;
|
---|
296 | // }
|
---|
297 | //
|
---|
298 | // if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9')
|
---|
299 | // {
|
---|
300 | // numbers++;
|
---|
301 | // }
|
---|
302 | //
|
---|
303 | // if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711)
|
---|
304 | // {
|
---|
305 | // arabicCharacters++;
|
---|
306 | // }
|
---|
307 | //
|
---|
308 | // if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016)
|
---|
309 | // {
|
---|
310 | // chineseCharacters++;
|
---|
311 | // }
|
---|
312 | //
|
---|
313 | // if (!charCount.containsKey(placeName.charAt(i)))
|
---|
314 | // {
|
---|
315 | // charCount.put(placeName.charAt(i), 1);
|
---|
316 | // }
|
---|
317 | // else
|
---|
318 | // {
|
---|
319 | // charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1);
|
---|
320 | // } |
---|