source: gs3-extensions/atlas-src/trunk/src/org/greenstone/server/GazetteerTrieType5.java@ 22272

Last change on this file since 22272 was 22272, checked in by sjm84, 14 years ago

Initial version of ATLAS as an extension

File size: 7.5 KB
Line 
1package org.greenstone.server;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.FileReader;
6import java.io.FileWriter;
7import java.io.Serializable;
8import java.sql.Connection;
9import java.sql.DriverManager;
10import java.sql.Statement;
11import java.util.ArrayList;
12import java.util.HashMap;
13import java.util.HashSet;
14
15import com.sun.org.apache.bcel.internal.generic.NEW;
16
17public class GazetteerTrieType5 implements Serializable
18{
19 private static final long serialVersionUID = -959184305931535981L;
20
21 int _nameCount = 0;
22
23 HashMap<String, Integer> _gazetteer = new HashMap<String, Integer>();
24
25 /**
26 * Default constructor
27 */
28 public GazetteerTrieType5()
29 {
30 }
31
32 /**
33 * Contructor that takes a filename to generate the gazetteer
34 *
35 * @param filename
36 * is the name of the file to use to generate the gazetteer with
37 */
38 public GazetteerTrieType5(String filename)
39 {
40 System.out.println("Loading " + filename + " as gazetteer");
41 try
42 {
43 BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename));
44 String line = "";
45
46 ArrayList<String> allPlaceNames = new ArrayList<String>();
47
48 int count = 0;
49 while ((line = gazetteerFile.readLine()) != null)
50 {
51 if (count++ % 10000 == 0)
52 {
53 System.out.println(count + " entries loaded");
54 }
55 // The file is tab seperated so split it by tabs
56 String[] columns = line.split("\t");
57
58 ArrayList<String> placeNames = new ArrayList<String>();
59
60 String mainPlaceName = GazetteerHelper.getMainPlaceName(columns);
61 if (mainPlaceName != null)
62 {
63 placeNames.add(mainPlaceName);
64 }
65
66 ArrayList<String> alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns);
67 if (alternatePlaceNames != null)
68 {
69 placeNames.addAll(alternatePlaceNames);
70 }
71
72 allPlaceNames.addAll(placeNames);
73
74 // Add the place names to the trie
75 for (String placeName : placeNames)
76 {
77 this.addPlaceName(placeName);
78 }
79 }
80 }
81 catch (Exception ex)
82 {
83 ex.printStackTrace();
84 }
85 removeAmbiguousPlaceNames();
86 }
87
88 /**
89 * Removes a place name from the trie
90 *
91 * @param placeName
92 * is the place name to be removed
93 * @return true if sucessful and false if the place name did not exist
94 */
95
96 public void removePlaceName(String placeName)
97 {
98 _gazetteer.remove(placeName);
99 }
100
101 /**
102 * Adds a place name to the trie
103 *
104 * @param placeName
105 * is the place name to add
106 */
107 public void addPlaceName(String placeName)
108 {
109 String[] words = placeName.split(" ");
110
111 for (int i = 0; i < (words.length - 1); i++)
112 {
113 _gazetteer.put(words[i], 0);
114 }
115 _gazetteer.put(words[words.length-1], 1);
116 }
117
118 /**
119 * Checks to see if a place name exists in the trie
120 *
121 * @param placeName
122 * is the place name to check
123 * @return 1 if the place name exists 0 if the place name does not exist but
124 * there might be a match further down the trie -1 if the place name
125 * does not exist and the trie is at a dead end
126 */
127 public int checkPlaceName(String placeName)
128 {
129 String[] words = placeName.split(" ");
130
131 for(int i = 0; i < (words.length - 1); i++)
132 {
133 if(_gazetteer.get(words[i]) == null)
134 {
135 return -1;
136 }
137 }
138
139 if (_gazetteer.get(words[words.length-1]) == null)
140 {
141 return -1;
142 }
143 else
144 {
145 return _gazetteer.get(words[words.length-1]) ;
146 }
147 }
148
149 /**
150 * Removes place names that are unlikely to be meant as place names in a
151 * given text
152 *
153 * @param gazetteer
154 * is the gazetteer to remove the place names from
155 */
156 public void removeAmbiguousPlaceNames()
157 {
158 removePlaceName("are");
159 removePlaceName("is");
160 removePlaceName("over");
161 removePlaceName("at");
162 removePlaceName("of");
163 removePlaceName("to");
164 removePlaceName("rule");
165 removePlaceName("time");
166 removePlaceName("real");
167 removePlaceName("national");
168 removePlaceName("early");
169 removePlaceName("by");
170 removePlaceName("as");
171 removePlaceName("eastern");
172 removePlaceName("western");
173 removePlaceName("southern");
174 removePlaceName("northern");
175 removePlaceName("east");
176 removePlaceName("west");
177 removePlaceName("south");
178 removePlaceName("north");
179 removePlaceName("this");
180 removePlaceName("between");
181 removePlaceName("many");
182 removePlaceName("strong");
183 removePlaceName("economy");
184 removePlaceName("mall");
185 removePlaceName("they");
186 removePlaceName("do");
187 removePlaceName("image");
188 removePlaceName("republic");
189 removePlaceName("section");
190 removePlaceName("dollar");
191 removePlaceName("index");
192 removePlaceName("day");
193 removePlaceName("council");
194 removePlaceName("use");
195 removePlaceName("log");
196 removePlaceName("logo");
197 removePlaceName("best");
198 removePlaceName("go");
199 removePlaceName("portal");
200 removePlaceName("list");
201 removePlaceName("english");
202 removePlaceName("page");
203 removePlaceName("see");
204 removePlaceName("ocean");
205 removePlaceName("island");
206 removePlaceName("x");
207 removePlaceName("country");
208 removePlaceName("colony");
209 removePlaceName("christian");
210 removePlaceName("black");
211 removePlaceName("independence");
212 removePlaceName("war");
213 removePlaceName("no");
214 removePlaceName("continental");
215 removePlaceName("force");
216 removePlaceName("reform");
217 removePlaceName("rush");
218 removePlaceName("read");
219 removePlaceName("none");
220 removePlaceName("justice");
221 removePlaceName("font");
222 removePlaceName("u");
223 removePlaceName("y");
224 removePlaceName("normal");
225 removePlaceName("center");
226 removePlaceName("date");
227 removePlaceName("story");
228 removePlaceName("union");
229 removePlaceName("supreme");
230 removePlaceName("house");
231 removePlaceName("court");
232 removePlaceName("data");
233 removePlaceName("energy");
234 removePlaceName("white");
235 removePlaceName("universal");
236 removePlaceName("protection");
237 removePlaceName("great");
238 removePlaceName("star");
239 removePlaceName("banner");
240 removePlaceName("capital");
241 removePlaceName("much");
242 removePlaceName("sidney");
243 removePlaceName("media");
244 removePlaceName("protection");
245
246 addPlaceName("United States");
247 }
248}
249
250// TreeMap<Character, Integer> charCount = new TreeMap<Character, Integer>();
251// int upperCount = 0;
252// int lowerCount = 0;
253// int numbers = 0;
254// int arabicCharacters = 0;
255// int chineseCharacters = 0;
256
257// if (!charCount.containsKey(placeName.charAt(placeName.length() - 1)))
258// {
259// charCount.put(placeName.charAt(placeName.length() - 1), 1);
260// }
261// else
262// {
263// charCount.put(placeName.charAt(placeName.length() - 1),
264// charCount.get(placeName.charAt(placeName.length() - 1)) + 1);
265// }
266
267// for (Character c : charCount.keySet())
268// {
269// System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c));
270// }
271//
272// System.out.println("Upper -> " + upperCount);
273// System.out.println("Lower -> " + lowerCount);
274// System.out.println("Number -> " + numbers);
275// System.out.println("Arabic -> " + arabicCharacters);
276// System.out.println("Chinese -> " + chineseCharacters);
277
278// if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z')
279// {
280// upperCount++;
281// }
282//
283// if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z')
284// {
285// lowerCount++;
286// }
287//
288// if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9')
289// {
290// numbers++;
291// }
292//
293// if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711)
294// {
295// arabicCharacters++;
296// }
297//
298// if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016)
299// {
300// chineseCharacters++;
301// }
302//
303// if (!charCount.containsKey(placeName.charAt(i)))
304// {
305// charCount.put(placeName.charAt(i), 1);
306// }
307// else
308// {
309// charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1);
310// }
Note: See TracBrowser for help on using the repository browser.