1 | package org.greenstone.atlas.server;
|
---|
2 |
|
---|
3 |
|
---|
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
6 |
|
---|
7 | public class MarkupService
|
---|
8 | {
|
---|
9 | /**
|
---|
10 | * Takes the given text and searches it for place names and then marks up any matches
|
---|
11 | * @param originalText is the given text
|
---|
12 | * @return a marked up version of the text
|
---|
13 | */
|
---|
14 | public String getMarkedUpText(String originalText)
|
---|
15 | {
|
---|
16 | //Get the gazetteer
|
---|
17 | GazetteerTrieType2 gazetteer = new GazetteerTrieType2("/research/sjm84/Msc/Downloads/dataen.txt");
|
---|
18 |
|
---|
19 | removeAmbiguousPlaceNames(gazetteer);
|
---|
20 |
|
---|
21 | //Find the words in the text
|
---|
22 | ArrayList<String> words = findWords(originalText);
|
---|
23 |
|
---|
24 | //Find the matches
|
---|
25 | HashMap<Integer, Integer> matches = findMatches(gazetteer, words);
|
---|
26 |
|
---|
27 | //Return the marked up text
|
---|
28 | return createMarkedUpText(words, matches, originalText);
|
---|
29 | }
|
---|
30 |
|
---|
31 | /**
|
---|
32 | * Takes the given text, list of words and list of matches to create marked up text
|
---|
33 | * @param words is the list of words in the text
|
---|
34 | * @param matches is the list of gazetteer matches in the text
|
---|
35 | * @param originalText is the orignal text that is to be marked up
|
---|
36 | * @return the marked up text
|
---|
37 | */
|
---|
38 | protected String createMarkedUpText(ArrayList<String> words, HashMap<Integer, Integer> matches, String originalText)
|
---|
39 | {
|
---|
40 | StringBuilder markedUpText = new StringBuilder();
|
---|
41 |
|
---|
42 | //Used to count what word in words is being used
|
---|
43 | int wordCount = 0;
|
---|
44 |
|
---|
45 | //Go through each character
|
---|
46 | int i = 0;
|
---|
47 | while(i < originalText.length())
|
---|
48 | {
|
---|
49 | //If the current character in the text is a letter
|
---|
50 | if(Character.isLetter(originalText.charAt(i)))
|
---|
51 | {
|
---|
52 | Integer numOfWordsInPlaceName = matches.get(wordCount);
|
---|
53 |
|
---|
54 | //If the word is a place name
|
---|
55 | if(numOfWordsInPlaceName != null)
|
---|
56 | {
|
---|
57 | int endingCharacters = 0;
|
---|
58 | StringBuilder placeName = new StringBuilder();
|
---|
59 |
|
---|
60 | //Get each word in the place name
|
---|
61 | for(int j = 0; j < numOfWordsInPlaceName; j++)
|
---|
62 | {
|
---|
63 | //Add the word to the place name
|
---|
64 | placeName.append(words.get(wordCount));
|
---|
65 |
|
---|
66 | //Move the corresponding amount of characters ahead
|
---|
67 | i += words.get(wordCount).length();
|
---|
68 | wordCount++;
|
---|
69 |
|
---|
70 | Character c = null;
|
---|
71 | endingCharacters = 0;
|
---|
72 |
|
---|
73 | //Add any characters between the words in the place name
|
---|
74 | while(i < originalText.length() && !Character.isLetter(c = originalText.charAt(i)))
|
---|
75 | {
|
---|
76 | placeName.append(c);
|
---|
77 | i++;
|
---|
78 | endingCharacters++;
|
---|
79 | }
|
---|
80 | }
|
---|
81 |
|
---|
82 | //Mark up the place name
|
---|
83 | markedUpText.append("<span style=\"background-color:yellow\">"
|
---|
84 | + placeName.substring(0, placeName.length() - endingCharacters)
|
---|
85 | + "</span>"
|
---|
86 | + placeName.substring(placeName.length() - endingCharacters, placeName.length()));
|
---|
87 | }
|
---|
88 | //If the word is not a place name
|
---|
89 | else
|
---|
90 | {
|
---|
91 | //Add it to the marked up text as is
|
---|
92 | String word = words.get(wordCount++);
|
---|
93 | markedUpText.append(word);
|
---|
94 | i += word.length();
|
---|
95 | }
|
---|
96 | }
|
---|
97 | //If the current character in the text is not a letter
|
---|
98 | else
|
---|
99 | {
|
---|
100 | //Add it to the marked up text as is
|
---|
101 | markedUpText.append(originalText.charAt(i));
|
---|
102 | i++;
|
---|
103 | }
|
---|
104 | }
|
---|
105 |
|
---|
106 | return markedUpText.toString();
|
---|
107 | }
|
---|
108 |
|
---|
109 | /**
|
---|
110 | * Searches the words of a text for matches in the gazetteer
|
---|
111 | * @param gaz is the gazetter (in trie form) to use
|
---|
112 | * @param text is the list of words to search
|
---|
113 | * @return a list of matches in the format <[first word index], [number of words in match]>
|
---|
114 | */
|
---|
115 | protected HashMap<Integer, Integer> findMatches(GazetteerTrieType2 gaz, ArrayList<String> words)
|
---|
116 | {
|
---|
117 | HashMap<Integer, Integer> matches = new HashMap<Integer, Integer>();
|
---|
118 |
|
---|
119 | //Go through every word in the list of words
|
---|
120 | for(int i = 0; i < words.size(); i++)
|
---|
121 | {
|
---|
122 | int j = i;
|
---|
123 | int result = 0;
|
---|
124 | StringBuilder currentWord = new StringBuilder();
|
---|
125 |
|
---|
126 | //Until a dead end in the trie is reached or there is no more words to append
|
---|
127 | while(j < words.size())
|
---|
128 | {
|
---|
129 | if(j == i && Character.isLowerCase(words.get(j).charAt(0)))
|
---|
130 | {
|
---|
131 | break;
|
---|
132 | }
|
---|
133 |
|
---|
134 | currentWord = currentWord.append(words.get(j));
|
---|
135 |
|
---|
136 | //Check the words in the gazetteer to see if there is a match
|
---|
137 | result = gaz.checkPlaceName(currentWord.toString());
|
---|
138 |
|
---|
139 | //If there is a match
|
---|
140 | if(result == 1)
|
---|
141 | {
|
---|
142 | //Store the match in the form <[first word index], [number of words in match]>
|
---|
143 | //Because shorter words are checked first longer matches will overwrite shorter matches
|
---|
144 | matches.put(i, (j-i) + 1);
|
---|
145 | }
|
---|
146 | else if(result == -1)
|
---|
147 | {
|
---|
148 | break;
|
---|
149 | }
|
---|
150 | j++;
|
---|
151 | currentWord = currentWord.append(" ");
|
---|
152 | }
|
---|
153 | }
|
---|
154 |
|
---|
155 | //Remove any overlapping matches and return
|
---|
156 | return correctMatches(matches);
|
---|
157 | }
|
---|
158 |
|
---|
159 | /**
|
---|
160 | * Takes an array of matches and removes any that overlap
|
---|
161 | * @param matches is the array of matches
|
---|
162 | * @return the corrected array of matches
|
---|
163 | */
|
---|
164 | protected HashMap<Integer, Integer> correctMatches(HashMap<Integer, Integer> matches)
|
---|
165 | {
|
---|
166 | Integer[] keys = matches.keySet().toArray(new Integer[0]);
|
---|
167 |
|
---|
168 | //Go through all the keys in the hash map
|
---|
169 | for(int i = 0; i < keys.length-1; i++)
|
---|
170 | {
|
---|
171 | //Ignore keys that have been removed
|
---|
172 | if(matches.get(keys[i]) == null)
|
---|
173 | {
|
---|
174 | continue;
|
---|
175 | }
|
---|
176 |
|
---|
177 | //Check and see if the current key overlaps any other keys
|
---|
178 | for(int j = keys[i] + 1; j < keys[i] + matches.get(keys[i]); j++)
|
---|
179 | {
|
---|
180 | //If so, remove the keys it overlaps
|
---|
181 | if(matches.get(j) != null)
|
---|
182 | {
|
---|
183 | matches.remove(j);
|
---|
184 | }
|
---|
185 | }
|
---|
186 | }
|
---|
187 |
|
---|
188 | return matches;
|
---|
189 | }
|
---|
190 |
|
---|
191 | /**
|
---|
192 | * Divides the given text into words and returns the list of words
|
---|
193 | * @param text is the text to divide
|
---|
194 | * @return the divided text
|
---|
195 | */
|
---|
196 | protected static ArrayList<String> findWords(String text)
|
---|
197 | {
|
---|
198 | ArrayList<String> words = new ArrayList<String>();
|
---|
199 | StringBuilder currentWord = new StringBuilder();
|
---|
200 |
|
---|
201 | //Go through each character in the text
|
---|
202 | for(int i = 0; i < text.length(); i++)
|
---|
203 | {
|
---|
204 | //If it is a letter then add it to the current word
|
---|
205 | if(Character.isLetter(text.charAt(i)) || text.charAt(i) == '-')
|
---|
206 | {
|
---|
207 | currentWord.append(text.charAt(i));
|
---|
208 | }
|
---|
209 | //If it is not a letter
|
---|
210 | else
|
---|
211 | {
|
---|
212 | //And a word currently is currently being created
|
---|
213 | if(currentWord.length() > 0 && text.charAt(i - currentWord.length() + 1) != '\'')
|
---|
214 | {
|
---|
215 | //Add the current word to the list of words
|
---|
216 | words.add(currentWord.toString());
|
---|
217 |
|
---|
218 | //Delete the current words
|
---|
219 | currentWord.delete(0, currentWord.length());
|
---|
220 | }
|
---|
221 | }
|
---|
222 | }
|
---|
223 |
|
---|
224 | //Add the final word of the text
|
---|
225 | if(currentWord.length() > 0)
|
---|
226 | {
|
---|
227 | words.add(currentWord.toString());
|
---|
228 | }
|
---|
229 |
|
---|
230 | return words;
|
---|
231 | }
|
---|
232 |
|
---|
233 | /**
|
---|
234 | * Removes place names that are unlikely to be meant as place names in a given text
|
---|
235 | * @param gazetteer is the gazetteer to remove the place names from
|
---|
236 | */
|
---|
237 | public void removeAmbiguousPlaceNames(GazetteerTrieType2 gazetteer)
|
---|
238 | {
|
---|
239 | gazetteer.removePlaceName("are"); gazetteer.removePlaceName("is");
|
---|
240 | gazetteer.removePlaceName("over"); gazetteer.removePlaceName("at");
|
---|
241 | gazetteer.removePlaceName("of"); gazetteer.removePlaceName("to");
|
---|
242 | gazetteer.removePlaceName("rule"); gazetteer.removePlaceName("time");
|
---|
243 | gazetteer.removePlaceName("real"); gazetteer.removePlaceName("national");
|
---|
244 | gazetteer.removePlaceName("early"); gazetteer.removePlaceName("by");
|
---|
245 | gazetteer.removePlaceName("as"); gazetteer.removePlaceName("eastern");
|
---|
246 | gazetteer.removePlaceName("western"); gazetteer.removePlaceName("southern");
|
---|
247 | gazetteer.removePlaceName("northern"); gazetteer.removePlaceName("east");
|
---|
248 | gazetteer.removePlaceName("west"); gazetteer.removePlaceName("south");
|
---|
249 | gazetteer.removePlaceName("north"); gazetteer.removePlaceName("this");
|
---|
250 | gazetteer.removePlaceName("between"); gazetteer.removePlaceName("many");
|
---|
251 | gazetteer.removePlaceName("strong"); gazetteer.removePlaceName("economy");
|
---|
252 | gazetteer.removePlaceName("mall"); gazetteer.removePlaceName("they");
|
---|
253 | gazetteer.removePlaceName("do"); gazetteer.removePlaceName("image");
|
---|
254 | gazetteer.removePlaceName("republic"); gazetteer.removePlaceName("section");
|
---|
255 | gazetteer.removePlaceName("dollar"); gazetteer.removePlaceName("index");
|
---|
256 | gazetteer.removePlaceName("day"); gazetteer.removePlaceName("council");
|
---|
257 | gazetteer.removePlaceName("use"); gazetteer.removePlaceName("log");
|
---|
258 | gazetteer.removePlaceName("logo"); gazetteer.removePlaceName("best");
|
---|
259 | gazetteer.removePlaceName("go"); gazetteer.removePlaceName("portal");
|
---|
260 | gazetteer.removePlaceName("list"); gazetteer.removePlaceName("english");
|
---|
261 | gazetteer.removePlaceName("page"); gazetteer.removePlaceName("see");
|
---|
262 | gazetteer.removePlaceName("ocean"); gazetteer.removePlaceName("island");
|
---|
263 | gazetteer.removePlaceName("x"); gazetteer.removePlaceName("country");
|
---|
264 | gazetteer.removePlaceName("colony"); gazetteer.removePlaceName("christian");
|
---|
265 | gazetteer.removePlaceName("black"); gazetteer.removePlaceName("independence");
|
---|
266 | gazetteer.removePlaceName("war"); gazetteer.removePlaceName("no");
|
---|
267 | gazetteer.removePlaceName("continental"); gazetteer.removePlaceName("continental");
|
---|
268 | gazetteer.removePlaceName("force"); gazetteer.removePlaceName("reform");
|
---|
269 | gazetteer.removePlaceName("rush"); gazetteer.removePlaceName("read");
|
---|
270 | gazetteer.removePlaceName("none"); gazetteer.removePlaceName("justice");
|
---|
271 | gazetteer.removePlaceName("font"); gazetteer.removePlaceName("u");
|
---|
272 | gazetteer.removePlaceName("y"); gazetteer.removePlaceName("normal");
|
---|
273 | gazetteer.removePlaceName("center"); gazetteer.removePlaceName("date");
|
---|
274 | gazetteer.removePlaceName("story"); gazetteer.removePlaceName("union");
|
---|
275 | gazetteer.removePlaceName("supreme"); gazetteer.removePlaceName("house");
|
---|
276 | gazetteer.removePlaceName("court"); gazetteer.removePlaceName("data");
|
---|
277 | gazetteer.removePlaceName("energy"); gazetteer.removePlaceName("white");
|
---|
278 | gazetteer.removePlaceName("universal"); gazetteer.removePlaceName("protection");
|
---|
279 | gazetteer.removePlaceName("great"); gazetteer.removePlaceName("star");
|
---|
280 | gazetteer.removePlaceName("banner"); gazetteer.removePlaceName("capital");
|
---|
281 | gazetteer.removePlaceName("much");
|
---|
282 |
|
---|
283 | gazetteer.addPlaceName("United States");
|
---|
284 | }
|
---|
285 | }
|
---|