1 | package org.greenstone.server;
|
---|
2 |
|
---|
3 | import java.io.BufferedReader;
|
---|
4 | import java.io.BufferedWriter;
|
---|
5 | import java.io.FileReader;
|
---|
6 | import java.io.FileWriter;
|
---|
7 | import java.io.Serializable;
|
---|
8 | import java.sql.Connection;
|
---|
9 | import java.sql.DriverManager;
|
---|
10 | import java.sql.Statement;
|
---|
11 | import java.util.ArrayList;
|
---|
12 | import java.util.HashMap;
|
---|
13 | import java.util.HashSet;
|
---|
14 |
|
---|
15 | import com.sun.org.apache.bcel.internal.generic.NEW;
|
---|
16 |
|
---|
17 | public class GazetteerTrieType5 implements Serializable
|
---|
18 | {
|
---|
19 | private static final long serialVersionUID = -959184305931535981L;
|
---|
20 |
|
---|
21 | int _nameCount = 0;
|
---|
22 |
|
---|
23 | HashMap<String, Integer> _gazetteer = new HashMap<String, Integer>();
|
---|
24 |
|
---|
25 | /**
|
---|
26 | * Default constructor
|
---|
27 | */
|
---|
28 | public GazetteerTrieType5()
|
---|
29 | {
|
---|
30 | }
|
---|
31 |
|
---|
32 | /**
|
---|
33 | * Contructor that takes a filename to generate the gazetteer
|
---|
34 | *
|
---|
35 | * @param filename
|
---|
36 | * is the name of the file to use to generate the gazetteer with
|
---|
37 | */
|
---|
38 | public GazetteerTrieType5(String filename)
|
---|
39 | {
|
---|
40 | System.out.println("Loading " + filename + " as gazetteer");
|
---|
41 | try
|
---|
42 | {
|
---|
43 | BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename));
|
---|
44 | String line = "";
|
---|
45 |
|
---|
46 | ArrayList<String> allPlaceNames = new ArrayList<String>();
|
---|
47 |
|
---|
48 | int count = 0;
|
---|
49 | while ((line = gazetteerFile.readLine()) != null)
|
---|
50 | {
|
---|
51 | if (count++ % 10000 == 0)
|
---|
52 | {
|
---|
53 | System.out.println(count + " entries loaded");
|
---|
54 | }
|
---|
55 | // The file is tab seperated so split it by tabs
|
---|
56 | String[] columns = line.split("\t");
|
---|
57 |
|
---|
58 | ArrayList<String> placeNames = new ArrayList<String>();
|
---|
59 |
|
---|
60 | String mainPlaceName = GazetteerHelper.getMainPlaceName(columns);
|
---|
61 | if (mainPlaceName != null)
|
---|
62 | {
|
---|
63 | placeNames.add(mainPlaceName);
|
---|
64 | }
|
---|
65 |
|
---|
66 | ArrayList<String> alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns);
|
---|
67 | if (alternatePlaceNames != null)
|
---|
68 | {
|
---|
69 | placeNames.addAll(alternatePlaceNames);
|
---|
70 | }
|
---|
71 |
|
---|
72 | allPlaceNames.addAll(placeNames);
|
---|
73 |
|
---|
74 | // Add the place names to the trie
|
---|
75 | for (String placeName : placeNames)
|
---|
76 | {
|
---|
77 | this.addPlaceName(placeName);
|
---|
78 | }
|
---|
79 | }
|
---|
80 | }
|
---|
81 | catch (Exception ex)
|
---|
82 | {
|
---|
83 | ex.printStackTrace();
|
---|
84 | }
|
---|
85 | removeAmbiguousPlaceNames();
|
---|
86 | }
|
---|
87 |
|
---|
88 | /**
|
---|
89 | * Removes a place name from the trie
|
---|
90 | *
|
---|
91 | * @param placeName
|
---|
92 | * is the place name to be removed
|
---|
93 | * @return true if sucessful and false if the place name did not exist
|
---|
94 | */
|
---|
95 |
|
---|
96 | public void removePlaceName(String placeName)
|
---|
97 | {
|
---|
98 | _gazetteer.remove(placeName);
|
---|
99 | }
|
---|
100 |
|
---|
101 | /**
|
---|
102 | * Adds a place name to the trie
|
---|
103 | *
|
---|
104 | * @param placeName
|
---|
105 | * is the place name to add
|
---|
106 | */
|
---|
107 | public void addPlaceName(String placeName)
|
---|
108 | {
|
---|
109 | String[] words = placeName.split(" ");
|
---|
110 |
|
---|
111 | for (int i = 0; i < (words.length - 1); i++)
|
---|
112 | {
|
---|
113 | _gazetteer.put(words[i], 0);
|
---|
114 | }
|
---|
115 | _gazetteer.put(words[words.length-1], 1);
|
---|
116 | }
|
---|
117 |
|
---|
118 | /**
|
---|
119 | * Checks to see if a place name exists in the trie
|
---|
120 | *
|
---|
121 | * @param placeName
|
---|
122 | * is the place name to check
|
---|
123 | * @return 1 if the place name exists 0 if the place name does not exist but
|
---|
124 | * there might be a match further down the trie -1 if the place name
|
---|
125 | * does not exist and the trie is at a dead end
|
---|
126 | */
|
---|
127 | public int checkPlaceName(String placeName)
|
---|
128 | {
|
---|
129 | String[] words = placeName.split(" ");
|
---|
130 |
|
---|
131 | for(int i = 0; i < (words.length - 1); i++)
|
---|
132 | {
|
---|
133 | if(_gazetteer.get(words[i]) == null)
|
---|
134 | {
|
---|
135 | return -1;
|
---|
136 | }
|
---|
137 | }
|
---|
138 |
|
---|
139 | if (_gazetteer.get(words[words.length-1]) == null)
|
---|
140 | {
|
---|
141 | return -1;
|
---|
142 | }
|
---|
143 | else
|
---|
144 | {
|
---|
145 | return _gazetteer.get(words[words.length-1]) ;
|
---|
146 | }
|
---|
147 | }
|
---|
148 |
|
---|
149 | /**
|
---|
150 | * Removes place names that are unlikely to be meant as place names in a
|
---|
151 | * given text
|
---|
152 | *
|
---|
153 | * @param gazetteer
|
---|
154 | * is the gazetteer to remove the place names from
|
---|
155 | */
|
---|
156 | public void removeAmbiguousPlaceNames()
|
---|
157 | {
|
---|
158 | removePlaceName("are");
|
---|
159 | removePlaceName("is");
|
---|
160 | removePlaceName("over");
|
---|
161 | removePlaceName("at");
|
---|
162 | removePlaceName("of");
|
---|
163 | removePlaceName("to");
|
---|
164 | removePlaceName("rule");
|
---|
165 | removePlaceName("time");
|
---|
166 | removePlaceName("real");
|
---|
167 | removePlaceName("national");
|
---|
168 | removePlaceName("early");
|
---|
169 | removePlaceName("by");
|
---|
170 | removePlaceName("as");
|
---|
171 | removePlaceName("eastern");
|
---|
172 | removePlaceName("western");
|
---|
173 | removePlaceName("southern");
|
---|
174 | removePlaceName("northern");
|
---|
175 | removePlaceName("east");
|
---|
176 | removePlaceName("west");
|
---|
177 | removePlaceName("south");
|
---|
178 | removePlaceName("north");
|
---|
179 | removePlaceName("this");
|
---|
180 | removePlaceName("between");
|
---|
181 | removePlaceName("many");
|
---|
182 | removePlaceName("strong");
|
---|
183 | removePlaceName("economy");
|
---|
184 | removePlaceName("mall");
|
---|
185 | removePlaceName("they");
|
---|
186 | removePlaceName("do");
|
---|
187 | removePlaceName("image");
|
---|
188 | removePlaceName("republic");
|
---|
189 | removePlaceName("section");
|
---|
190 | removePlaceName("dollar");
|
---|
191 | removePlaceName("index");
|
---|
192 | removePlaceName("day");
|
---|
193 | removePlaceName("council");
|
---|
194 | removePlaceName("use");
|
---|
195 | removePlaceName("log");
|
---|
196 | removePlaceName("logo");
|
---|
197 | removePlaceName("best");
|
---|
198 | removePlaceName("go");
|
---|
199 | removePlaceName("portal");
|
---|
200 | removePlaceName("list");
|
---|
201 | removePlaceName("english");
|
---|
202 | removePlaceName("page");
|
---|
203 | removePlaceName("see");
|
---|
204 | removePlaceName("ocean");
|
---|
205 | removePlaceName("island");
|
---|
206 | removePlaceName("x");
|
---|
207 | removePlaceName("country");
|
---|
208 | removePlaceName("colony");
|
---|
209 | removePlaceName("christian");
|
---|
210 | removePlaceName("black");
|
---|
211 | removePlaceName("independence");
|
---|
212 | removePlaceName("war");
|
---|
213 | removePlaceName("no");
|
---|
214 | removePlaceName("continental");
|
---|
215 | removePlaceName("force");
|
---|
216 | removePlaceName("reform");
|
---|
217 | removePlaceName("rush");
|
---|
218 | removePlaceName("read");
|
---|
219 | removePlaceName("none");
|
---|
220 | removePlaceName("justice");
|
---|
221 | removePlaceName("font");
|
---|
222 | removePlaceName("u");
|
---|
223 | removePlaceName("y");
|
---|
224 | removePlaceName("normal");
|
---|
225 | removePlaceName("center");
|
---|
226 | removePlaceName("date");
|
---|
227 | removePlaceName("story");
|
---|
228 | removePlaceName("union");
|
---|
229 | removePlaceName("supreme");
|
---|
230 | removePlaceName("house");
|
---|
231 | removePlaceName("court");
|
---|
232 | removePlaceName("data");
|
---|
233 | removePlaceName("energy");
|
---|
234 | removePlaceName("white");
|
---|
235 | removePlaceName("universal");
|
---|
236 | removePlaceName("protection");
|
---|
237 | removePlaceName("great");
|
---|
238 | removePlaceName("star");
|
---|
239 | removePlaceName("banner");
|
---|
240 | removePlaceName("capital");
|
---|
241 | removePlaceName("much");
|
---|
242 | removePlaceName("sidney");
|
---|
243 | removePlaceName("media");
|
---|
244 | removePlaceName("protection");
|
---|
245 |
|
---|
246 | addPlaceName("United States");
|
---|
247 | }
|
---|
248 | }
|
---|
249 |
|
---|
250 | // TreeMap<Character, Integer> charCount = new TreeMap<Character, Integer>();
|
---|
251 | // int upperCount = 0;
|
---|
252 | // int lowerCount = 0;
|
---|
253 | // int numbers = 0;
|
---|
254 | // int arabicCharacters = 0;
|
---|
255 | // int chineseCharacters = 0;
|
---|
256 |
|
---|
257 | // if (!charCount.containsKey(placeName.charAt(placeName.length() - 1)))
|
---|
258 | // {
|
---|
259 | // charCount.put(placeName.charAt(placeName.length() - 1), 1);
|
---|
260 | // }
|
---|
261 | // else
|
---|
262 | // {
|
---|
263 | // charCount.put(placeName.charAt(placeName.length() - 1),
|
---|
264 | // charCount.get(placeName.charAt(placeName.length() - 1)) + 1);
|
---|
265 | // }
|
---|
266 |
|
---|
267 | // for (Character c : charCount.keySet())
|
---|
268 | // {
|
---|
269 | // System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c));
|
---|
270 | // }
|
---|
271 | //
|
---|
272 | // System.out.println("Upper -> " + upperCount);
|
---|
273 | // System.out.println("Lower -> " + lowerCount);
|
---|
274 | // System.out.println("Number -> " + numbers);
|
---|
275 | // System.out.println("Arabic -> " + arabicCharacters);
|
---|
276 | // System.out.println("Chinese -> " + chineseCharacters);
|
---|
277 |
|
---|
278 | // if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z')
|
---|
279 | // {
|
---|
280 | // upperCount++;
|
---|
281 | // }
|
---|
282 | //
|
---|
283 | // if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z')
|
---|
284 | // {
|
---|
285 | // lowerCount++;
|
---|
286 | // }
|
---|
287 | //
|
---|
288 | // if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9')
|
---|
289 | // {
|
---|
290 | // numbers++;
|
---|
291 | // }
|
---|
292 | //
|
---|
293 | // if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711)
|
---|
294 | // {
|
---|
295 | // arabicCharacters++;
|
---|
296 | // }
|
---|
297 | //
|
---|
298 | // if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016)
|
---|
299 | // {
|
---|
300 | // chineseCharacters++;
|
---|
301 | // }
|
---|
302 | //
|
---|
303 | // if (!charCount.containsKey(placeName.charAt(i)))
|
---|
304 | // {
|
---|
305 | // charCount.put(placeName.charAt(i), 1);
|
---|
306 | // }
|
---|
307 | // else
|
---|
308 | // {
|
---|
309 | // charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1);
|
---|
310 | // } |
---|