1 | package org.greenstone.server;
|
---|
2 |
|
---|
3 | import java.io.BufferedReader;
|
---|
4 | import java.io.FileReader;
|
---|
5 | import java.io.Serializable;
|
---|
6 | import java.sql.Statement;
|
---|
7 | import java.util.ArrayList;
|
---|
8 |
|
---|
9 | public class GazetteerTrieType4 implements Serializable
|
---|
10 | {
|
---|
11 | private static final long serialVersionUID = -959184305931535981L;
|
---|
12 |
|
---|
13 | int _nameCount = 0;
|
---|
14 |
|
---|
15 | //GazetteerTrieNode _topLevelNode = new GazetteerTrieNode(false);
|
---|
16 | GazetteerTrieSmallNode _topLevelNode = new GazetteerTrieSmallNode(false);
|
---|
17 |
|
---|
18 | Statement _database = null;
|
---|
19 |
|
---|
20 | /**
|
---|
21 | * Default constructor
|
---|
22 | */
|
---|
23 | public GazetteerTrieType4()
|
---|
24 | {
|
---|
25 | }
|
---|
26 |
|
---|
27 | /**
|
---|
28 | * Contructor that takes a filename to generate the gazetteer
|
---|
29 | *
|
---|
30 | * @param filename
|
---|
31 | * is the name of the file to use to generate the gazetteer with
|
---|
32 | */
|
---|
33 | public GazetteerTrieType4(String filename)
|
---|
34 | {
|
---|
35 | System.out.println("Loading " + filename + " as gazetteer");
|
---|
36 | try
|
---|
37 | {
|
---|
38 | BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename));
|
---|
39 | String line = "";
|
---|
40 |
|
---|
41 | ArrayList<String> allPlaceNames = new ArrayList<String>();
|
---|
42 |
|
---|
43 | int count = 0;
|
---|
44 | while ((line = gazetteerFile.readLine()) != null)
|
---|
45 | {
|
---|
46 | if(count++ % 10000 == 0){System.out.println(count + " entries loaded");}
|
---|
47 | // The file is tab seperated so split it by tabs
|
---|
48 | String[] columns = line.split("\t");
|
---|
49 |
|
---|
50 | ArrayList<String> placeNames = new ArrayList<String>();
|
---|
51 |
|
---|
52 | String mainPlaceName = GazetteerHelper.getMainPlaceName(columns);
|
---|
53 | if (mainPlaceName != null)
|
---|
54 | {
|
---|
55 | placeNames.add(mainPlaceName);
|
---|
56 | }
|
---|
57 |
|
---|
58 | ArrayList<String> alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns);
|
---|
59 | if (alternatePlaceNames != null)
|
---|
60 | {
|
---|
61 | placeNames.addAll(alternatePlaceNames);
|
---|
62 | }
|
---|
63 |
|
---|
64 | allPlaceNames.addAll(placeNames);
|
---|
65 |
|
---|
66 | // Add the place names to the trie
|
---|
67 | for (String placeName : placeNames)
|
---|
68 | {
|
---|
69 | this.addPlaceName(placeName);
|
---|
70 | }
|
---|
71 | }
|
---|
72 | }
|
---|
73 | catch (Exception ex)
|
---|
74 | {
|
---|
75 | ex.printStackTrace();
|
---|
76 | }
|
---|
77 | removeAmbiguousPlaceNames();
|
---|
78 | }
|
---|
79 |
|
---|
80 | /**
|
---|
81 | * Removes a place name from the trie
|
---|
82 | *
|
---|
83 | * @param placeName
|
---|
84 | * is the place name to be removed
|
---|
85 | * @return true if sucessful and false if the place name did not exist
|
---|
86 | */
|
---|
87 |
|
---|
88 | public boolean removePlaceName(String placeName)
|
---|
89 | {
|
---|
90 | if (placeName.length() == 0)
|
---|
91 | {
|
---|
92 | _topLevelNode.setNameEnd(false);
|
---|
93 | return true;
|
---|
94 | }
|
---|
95 | if (placeName.length() == 1)
|
---|
96 | {
|
---|
97 | _topLevelNode.getChild(placeName.charAt(0)).setNameEnd(false);
|
---|
98 | return true;
|
---|
99 | }
|
---|
100 | GazetteerTrieSmallNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
|
---|
101 | if(currentNode == null)
|
---|
102 | {
|
---|
103 | System.out.println(placeName.charAt(0) + " = null?");
|
---|
104 | return false;
|
---|
105 | }
|
---|
106 |
|
---|
107 | for (int i = 1; i < placeName.length(); i++)
|
---|
108 | {
|
---|
109 | currentNode = currentNode.getChild(placeName.charAt(i));
|
---|
110 | if (currentNode == null)
|
---|
111 | {
|
---|
112 | return false;
|
---|
113 | }
|
---|
114 | }
|
---|
115 |
|
---|
116 | currentNode.setNameEnd(false);
|
---|
117 | return true;
|
---|
118 | }
|
---|
119 |
|
---|
120 | /**
|
---|
121 | * Adds a place name to the trie
|
---|
122 | *
|
---|
123 | * @param placeName
|
---|
124 | * is the place name to add
|
---|
125 | */
|
---|
126 | public void addPlaceName(String placeName)
|
---|
127 | {
|
---|
128 | if (placeName.length() == 0)
|
---|
129 | {
|
---|
130 | return;
|
---|
131 | }
|
---|
132 |
|
---|
133 | if (placeName.length() == 1)
|
---|
134 | {
|
---|
135 | _topLevelNode.addChild(placeName.charAt(0), true);
|
---|
136 | return;
|
---|
137 | }
|
---|
138 |
|
---|
139 | _topLevelNode.addChild(placeName.charAt(0), false);
|
---|
140 | GazetteerTrieSmallNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
|
---|
141 | for (int i = 1; i < placeName.length() - 1; i++)
|
---|
142 | {
|
---|
143 | currentNode.addChild(placeName.charAt(i), false);
|
---|
144 | currentNode = currentNode.getChild(placeName.charAt(i));
|
---|
145 | }
|
---|
146 |
|
---|
147 | currentNode.addChild(placeName.charAt(placeName.length() - 1), true);
|
---|
148 | }
|
---|
149 |
|
---|
150 | /**
|
---|
151 | * Checks to see if a place name exists in the trie
|
---|
152 | *
|
---|
153 | * @param placeName
|
---|
154 | * is the place name to check
|
---|
155 | * @return 1 if the place name exists 0 if the place name does not exist but
|
---|
156 | * there might be a match further down the trie -1 if the place name
|
---|
157 | * does not exist and the trie is at a dead end
|
---|
158 | */
|
---|
159 | public int checkPlaceName(String placeName)
|
---|
160 | {
|
---|
161 | if (placeName.length() == 0)
|
---|
162 | {
|
---|
163 | return 0;
|
---|
164 | }
|
---|
165 |
|
---|
166 | if (placeName.length() == 1)
|
---|
167 | {
|
---|
168 | GazetteerTrieSmallNode node = _topLevelNode.getChild(placeName.charAt(0));
|
---|
169 | if (node == null)
|
---|
170 | {
|
---|
171 | return -1;
|
---|
172 | }
|
---|
173 |
|
---|
174 | return node.isNameEnd() ? 1 : 0;
|
---|
175 | }
|
---|
176 |
|
---|
177 | GazetteerTrieSmallNode currentNode = _topLevelNode.getChild(placeName.charAt(0));
|
---|
178 | if (currentNode == null)
|
---|
179 | {
|
---|
180 | return -1;
|
---|
181 | }
|
---|
182 | for (int i = 1; i < placeName.length(); i++)
|
---|
183 | {
|
---|
184 | currentNode = currentNode.getChild(placeName.charAt(i));
|
---|
185 | if (currentNode == null)
|
---|
186 | {
|
---|
187 | return -1;
|
---|
188 | }
|
---|
189 | }
|
---|
190 |
|
---|
191 | if (currentNode.isNameEnd())
|
---|
192 | {
|
---|
193 | return 1;
|
---|
194 | }
|
---|
195 | else
|
---|
196 | {
|
---|
197 | return 0;
|
---|
198 | }
|
---|
199 | }
|
---|
200 |
|
---|
201 | /**
|
---|
202 | * Removes place names that are unlikely to be meant as place names in a given text
|
---|
203 | * @param gazetteer is the gazetteer to remove the place names from
|
---|
204 | */
|
---|
205 | public void removeAmbiguousPlaceNames()
|
---|
206 | {
|
---|
207 | removePlaceName("are"); removePlaceName("is");
|
---|
208 | removePlaceName("over"); removePlaceName("at");
|
---|
209 | removePlaceName("of"); removePlaceName("to");
|
---|
210 | removePlaceName("rule"); removePlaceName("time");
|
---|
211 | removePlaceName("real"); removePlaceName("national");
|
---|
212 | removePlaceName("early"); removePlaceName("by");
|
---|
213 | removePlaceName("as"); removePlaceName("eastern");
|
---|
214 | removePlaceName("western"); removePlaceName("southern");
|
---|
215 | removePlaceName("northern"); removePlaceName("east");
|
---|
216 | removePlaceName("west"); removePlaceName("south");
|
---|
217 | removePlaceName("north"); removePlaceName("this");
|
---|
218 | removePlaceName("between"); removePlaceName("many");
|
---|
219 | removePlaceName("strong"); removePlaceName("economy");
|
---|
220 | removePlaceName("mall"); removePlaceName("they");
|
---|
221 | removePlaceName("do"); removePlaceName("image");
|
---|
222 | removePlaceName("republic"); removePlaceName("section");
|
---|
223 | removePlaceName("dollar"); removePlaceName("index");
|
---|
224 | removePlaceName("day"); removePlaceName("council");
|
---|
225 | removePlaceName("use"); removePlaceName("log");
|
---|
226 | removePlaceName("logo"); removePlaceName("best");
|
---|
227 | removePlaceName("go"); removePlaceName("portal");
|
---|
228 | removePlaceName("list"); removePlaceName("english");
|
---|
229 | removePlaceName("page"); removePlaceName("see");
|
---|
230 | removePlaceName("ocean"); removePlaceName("island");
|
---|
231 | removePlaceName("x"); removePlaceName("country");
|
---|
232 | removePlaceName("colony"); removePlaceName("christian");
|
---|
233 | removePlaceName("black"); removePlaceName("independence");
|
---|
234 | removePlaceName("war"); removePlaceName("no");
|
---|
235 | removePlaceName("continental"); removePlaceName("continental");
|
---|
236 | removePlaceName("force"); removePlaceName("reform");
|
---|
237 | removePlaceName("rush"); removePlaceName("read");
|
---|
238 | removePlaceName("none"); removePlaceName("justice");
|
---|
239 | removePlaceName("font"); removePlaceName("u");
|
---|
240 | removePlaceName("y"); removePlaceName("normal");
|
---|
241 | removePlaceName("center"); removePlaceName("date");
|
---|
242 | removePlaceName("story"); removePlaceName("union");
|
---|
243 | removePlaceName("supreme"); removePlaceName("house");
|
---|
244 | removePlaceName("court"); removePlaceName("data");
|
---|
245 | removePlaceName("energy"); removePlaceName("white");
|
---|
246 | removePlaceName("universal"); removePlaceName("protection");
|
---|
247 | removePlaceName("great"); removePlaceName("star");
|
---|
248 | removePlaceName("banner"); removePlaceName("capital");
|
---|
249 | removePlaceName("much"); removePlaceName("sidney");
|
---|
250 | removePlaceName("media"); removePlaceName("protection");
|
---|
251 |
|
---|
252 | addPlaceName("United States");
|
---|
253 | }
|
---|
254 | }
|
---|
255 |
|
---|
256 | // TreeMap<Character, Integer> charCount = new TreeMap<Character, Integer>();
|
---|
257 | // int upperCount = 0;
|
---|
258 | // int lowerCount = 0;
|
---|
259 | // int numbers = 0;
|
---|
260 | // int arabicCharacters = 0;
|
---|
261 | // int chineseCharacters = 0;
|
---|
262 |
|
---|
263 | // if (!charCount.containsKey(placeName.charAt(placeName.length() - 1)))
|
---|
264 | // {
|
---|
265 | // charCount.put(placeName.charAt(placeName.length() - 1), 1);
|
---|
266 | // }
|
---|
267 | // else
|
---|
268 | // {
|
---|
269 | // charCount.put(placeName.charAt(placeName.length() - 1),
|
---|
270 | // charCount.get(placeName.charAt(placeName.length() - 1)) + 1);
|
---|
271 | // }
|
---|
272 |
|
---|
273 | // for (Character c : charCount.keySet())
|
---|
274 | // {
|
---|
275 | // System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c));
|
---|
276 | // }
|
---|
277 | //
|
---|
278 | // System.out.println("Upper -> " + upperCount);
|
---|
279 | // System.out.println("Lower -> " + lowerCount);
|
---|
280 | // System.out.println("Number -> " + numbers);
|
---|
281 | // System.out.println("Arabic -> " + arabicCharacters);
|
---|
282 | // System.out.println("Chinese -> " + chineseCharacters);
|
---|
283 |
|
---|
284 | // if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z')
|
---|
285 | // {
|
---|
286 | // upperCount++;
|
---|
287 | // }
|
---|
288 | //
|
---|
289 | // if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z')
|
---|
290 | // {
|
---|
291 | // lowerCount++;
|
---|
292 | // }
|
---|
293 | //
|
---|
294 | // if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9')
|
---|
295 | // {
|
---|
296 | // numbers++;
|
---|
297 | // }
|
---|
298 | //
|
---|
299 | // if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711)
|
---|
300 | // {
|
---|
301 | // arabicCharacters++;
|
---|
302 | // }
|
---|
303 | //
|
---|
304 | // if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016)
|
---|
305 | // {
|
---|
306 | // chineseCharacters++;
|
---|
307 | // }
|
---|
308 | //
|
---|
309 | // if (!charCount.containsKey(placeName.charAt(i)))
|
---|
310 | // {
|
---|
311 | // charCount.put(placeName.charAt(i), 1);
|
---|
312 | // }
|
---|
313 | // else
|
---|
314 | // {
|
---|
315 | // charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1);
|
---|
316 | // } |
---|