source: gs3-extensions/atlas-src/trunk/src/org/greenstone/atlas/server/GazetteerTrieType5.java@ 23906

Last change on this file since 23906 was 23906, checked in by sjm84, 13 years ago

Committing most recent version of ATLAS

File size: 7.3 KB
Line 
1package org.greenstone.server;
2
3import java.io.BufferedReader;
4import java.io.FileReader;
5import java.io.Serializable;
6import java.util.ArrayList;
7import java.util.HashMap;
8
9public class GazetteerTrieType5 implements Serializable
10{
11 private static final long serialVersionUID = -959184305931535981L;
12
13 int _nameCount = 0;
14
15 HashMap<String, Integer> _gazetteer = new HashMap<String, Integer>();
16
17 /**
18 * Default constructor
19 */
20 public GazetteerTrieType5()
21 {
22 }
23
24 /**
25 * Contructor that takes a filename to generate the gazetteer
26 *
27 * @param filename
28 * is the name of the file to use to generate the gazetteer with
29 */
30 public GazetteerTrieType5(String filename)
31 {
32 System.out.println("Loading " + filename + " as gazetteer");
33 try
34 {
35 BufferedReader gazetteerFile = new BufferedReader(new FileReader(filename));
36 String line = "";
37
38 ArrayList<String> allPlaceNames = new ArrayList<String>();
39
40 int count = 0;
41 while ((line = gazetteerFile.readLine()) != null)
42 {
43 if (count++ % 10000 == 0)
44 {
45 System.out.println(count + " entries loaded");
46 }
47 // The file is tab seperated so split it by tabs
48 String[] columns = line.split("\t");
49
50 ArrayList<String> placeNames = new ArrayList<String>();
51
52 String mainPlaceName = GazetteerHelper.getMainPlaceName(columns);
53 if (mainPlaceName != null)
54 {
55 placeNames.add(mainPlaceName);
56 }
57
58 ArrayList<String> alternatePlaceNames = GazetteerHelper.getAlternativePlaceNames(columns);
59 if (alternatePlaceNames != null)
60 {
61 placeNames.addAll(alternatePlaceNames);
62 }
63
64 allPlaceNames.addAll(placeNames);
65
66 // Add the place names to the trie
67 for (String placeName : placeNames)
68 {
69 this.addPlaceName(placeName);
70 }
71 }
72 }
73 catch (Exception ex)
74 {
75 ex.printStackTrace();
76 }
77 removeAmbiguousPlaceNames();
78 }
79
80 /**
81 * Removes a place name from the trie
82 *
83 * @param placeName
84 * is the place name to be removed
85 * @return true if sucessful and false if the place name did not exist
86 */
87
88 public void removePlaceName(String placeName)
89 {
90 _gazetteer.remove(placeName);
91 }
92
93 /**
94 * Adds a place name to the trie
95 *
96 * @param placeName
97 * is the place name to add
98 */
99 public void addPlaceName(String placeName)
100 {
101 String[] words = placeName.split(" ");
102
103 for (int i = 0; i < (words.length - 1); i++)
104 {
105 _gazetteer.put(words[i], 0);
106 }
107 _gazetteer.put(words[words.length-1], 1);
108 }
109
110 /**
111 * Checks to see if a place name exists in the trie
112 *
113 * @param placeName
114 * is the place name to check
115 * @return 1 if the place name exists 0 if the place name does not exist but
116 * there might be a match further down the trie -1 if the place name
117 * does not exist and the trie is at a dead end
118 */
119 public int checkPlaceName(String placeName)
120 {
121 String[] words = placeName.split(" ");
122
123 for(int i = 0; i < (words.length - 1); i++)
124 {
125 if(_gazetteer.get(words[i]) == null)
126 {
127 return -1;
128 }
129 }
130
131 if (_gazetteer.get(words[words.length-1]) == null)
132 {
133 return -1;
134 }
135 else
136 {
137 return _gazetteer.get(words[words.length-1]) ;
138 }
139 }
140
141 /**
142 * Removes place names that are unlikely to be meant as place names in a
143 * given text
144 *
145 * @param gazetteer
146 * is the gazetteer to remove the place names from
147 */
148 public void removeAmbiguousPlaceNames()
149 {
150 removePlaceName("are");
151 removePlaceName("is");
152 removePlaceName("over");
153 removePlaceName("at");
154 removePlaceName("of");
155 removePlaceName("to");
156 removePlaceName("rule");
157 removePlaceName("time");
158 removePlaceName("real");
159 removePlaceName("national");
160 removePlaceName("early");
161 removePlaceName("by");
162 removePlaceName("as");
163 removePlaceName("eastern");
164 removePlaceName("western");
165 removePlaceName("southern");
166 removePlaceName("northern");
167 removePlaceName("east");
168 removePlaceName("west");
169 removePlaceName("south");
170 removePlaceName("north");
171 removePlaceName("this");
172 removePlaceName("between");
173 removePlaceName("many");
174 removePlaceName("strong");
175 removePlaceName("economy");
176 removePlaceName("mall");
177 removePlaceName("they");
178 removePlaceName("do");
179 removePlaceName("image");
180 removePlaceName("republic");
181 removePlaceName("section");
182 removePlaceName("dollar");
183 removePlaceName("index");
184 removePlaceName("day");
185 removePlaceName("council");
186 removePlaceName("use");
187 removePlaceName("log");
188 removePlaceName("logo");
189 removePlaceName("best");
190 removePlaceName("go");
191 removePlaceName("portal");
192 removePlaceName("list");
193 removePlaceName("english");
194 removePlaceName("page");
195 removePlaceName("see");
196 removePlaceName("ocean");
197 removePlaceName("island");
198 removePlaceName("x");
199 removePlaceName("country");
200 removePlaceName("colony");
201 removePlaceName("christian");
202 removePlaceName("black");
203 removePlaceName("independence");
204 removePlaceName("war");
205 removePlaceName("no");
206 removePlaceName("continental");
207 removePlaceName("force");
208 removePlaceName("reform");
209 removePlaceName("rush");
210 removePlaceName("read");
211 removePlaceName("none");
212 removePlaceName("justice");
213 removePlaceName("font");
214 removePlaceName("u");
215 removePlaceName("y");
216 removePlaceName("normal");
217 removePlaceName("center");
218 removePlaceName("date");
219 removePlaceName("story");
220 removePlaceName("union");
221 removePlaceName("supreme");
222 removePlaceName("house");
223 removePlaceName("court");
224 removePlaceName("data");
225 removePlaceName("energy");
226 removePlaceName("white");
227 removePlaceName("universal");
228 removePlaceName("protection");
229 removePlaceName("great");
230 removePlaceName("star");
231 removePlaceName("banner");
232 removePlaceName("capital");
233 removePlaceName("much");
234 removePlaceName("sidney");
235 removePlaceName("media");
236 removePlaceName("protection");
237
238 addPlaceName("United States");
239 }
240}
241
242// TreeMap<Character, Integer> charCount = new TreeMap<Character, Integer>();
243// int upperCount = 0;
244// int lowerCount = 0;
245// int numbers = 0;
246// int arabicCharacters = 0;
247// int chineseCharacters = 0;
248
249// if (!charCount.containsKey(placeName.charAt(placeName.length() - 1)))
250// {
251// charCount.put(placeName.charAt(placeName.length() - 1), 1);
252// }
253// else
254// {
255// charCount.put(placeName.charAt(placeName.length() - 1),
256// charCount.get(placeName.charAt(placeName.length() - 1)) + 1);
257// }
258
259// for (Character c : charCount.keySet())
260// {
261// System.out.println(c + " (" + (int) c + ") -> " + charCount.get(c));
262// }
263//
264// System.out.println("Upper -> " + upperCount);
265// System.out.println("Lower -> " + lowerCount);
266// System.out.println("Number -> " + numbers);
267// System.out.println("Arabic -> " + arabicCharacters);
268// System.out.println("Chinese -> " + chineseCharacters);
269
270// if (placeName.charAt(i) >= 'A' && placeName.charAt(i) <= 'Z')
271// {
272// upperCount++;
273// }
274//
275// if (placeName.charAt(i) >= 'a' && placeName.charAt(i) <= 'z')
276// {
277// lowerCount++;
278// }
279//
280// if (placeName.charAt(i) >= '0' && placeName.charAt(i) <= '9')
281// {
282// numbers++;
283// }
284//
285// if (placeName.charAt(i) >= 1569 && placeName.charAt(i) <= 1711)
286// {
287// arabicCharacters++;
288// }
289//
290// if (placeName.charAt(i) >= 12293 && placeName.charAt(i) <= 64016)
291// {
292// chineseCharacters++;
293// }
294//
295// if (!charCount.containsKey(placeName.charAt(i)))
296// {
297// charCount.put(placeName.charAt(i), 1);
298// }
299// else
300// {
301// charCount.put(placeName.charAt(i), charCount.get(placeName.charAt(i)) + 1);
302// }
Note: See TracBrowser for help on using the repository browser.