source: gs3-extensions/atlas-src/trunk/src/org/greenstone/server/MarkupService.java@ 22272

Last change on this file since 22272 was 22272, checked in by sjm84, 14 years ago

Initial version of ATLAS as an extension

File size: 9.7 KB
Line 
1package org.greenstone.server;
2
3
4import java.util.HashMap;
5import java.util.ArrayList;
6
7public class MarkupService
8{
9 /**
10 * Takes the given text and searches it for place names and then marks up any matches
11 * @param originalText is the given text
12 * @return a marked up version of the text
13 */
14 public String getMarkedUpText(String originalText)
15 {
16 //Get the gazetteer
17 GazetteerTrieType2 gazetteer = new GazetteerTrieType2("/research/sjm84/Msc/Downloads/dataen.txt");
18
19 removeAmbiguousPlaceNames(gazetteer);
20
21 //Find the words in the text
22 ArrayList<String> words = findWords(originalText);
23
24 //Find the matches
25 HashMap<Integer, Integer> matches = findMatches(gazetteer, words);
26
27 //Return the marked up text
28 return createMarkedUpText(words, matches, originalText);
29 }
30
31 /**
32 * Takes the given text, list of words and list of matches to create marked up text
33 * @param words is the list of words in the text
34 * @param matches is the list of gazetteer matches in the text
35 * @param originalText is the orignal text that is to be marked up
36 * @return the marked up text
37 */
38 protected String createMarkedUpText(ArrayList<String> words, HashMap<Integer, Integer> matches, String originalText)
39 {
40 StringBuilder markedUpText = new StringBuilder();
41
42 //Used to count what word in words is being used
43 int wordCount = 0;
44
45 //Go through each character
46 int i = 0;
47 while(i < originalText.length())
48 {
49 //If the current character in the text is a letter
50 if(Character.isLetter(originalText.charAt(i)))
51 {
52 Integer numOfWordsInPlaceName = matches.get(wordCount);
53
54 //If the word is a place name
55 if(numOfWordsInPlaceName != null)
56 {
57 int endingCharacters = 0;
58 StringBuilder placeName = new StringBuilder();
59
60 //Get each word in the place name
61 for(int j = 0; j < numOfWordsInPlaceName; j++)
62 {
63 //Add the word to the place name
64 placeName.append(words.get(wordCount));
65
66 //Move the corresponding amount of characters ahead
67 i += words.get(wordCount).length();
68 wordCount++;
69
70 Character c = null;
71 endingCharacters = 0;
72
73 //Add any characters between the words in the place name
74 while(i < originalText.length() && !Character.isLetter(c = originalText.charAt(i)))
75 {
76 placeName.append(c);
77 i++;
78 endingCharacters++;
79 }
80 }
81
82 //Mark up the place name
83 markedUpText.append("<span style=\"background-color:yellow\">"
84 + placeName.substring(0, placeName.length() - endingCharacters)
85 + "</span>"
86 + placeName.substring(placeName.length() - endingCharacters, placeName.length()));
87 }
88 //If the word is not a place name
89 else
90 {
91 //Add it to the marked up text as is
92 String word = words.get(wordCount++);
93 markedUpText.append(word);
94 i += word.length();
95 }
96 }
97 //If the current character in the text is not a letter
98 else
99 {
100 //Add it to the marked up text as is
101 markedUpText.append(originalText.charAt(i));
102 i++;
103 }
104 }
105
106 return markedUpText.toString();
107 }
108
109 /**
110 * Searches the words of a text for matches in the gazetteer
111 * @param gaz is the gazetter (in trie form) to use
112 * @param text is the list of words to search
113 * @return a list of matches in the format <[first word index], [number of words in match]>
114 */
115 protected HashMap<Integer, Integer> findMatches(GazetteerTrieType2 gaz, ArrayList<String> words)
116 {
117 HashMap<Integer, Integer> matches = new HashMap<Integer, Integer>();
118
119 //Go through every word in the list of words
120 for(int i = 0; i < words.size(); i++)
121 {
122 int j = i;
123 int result = 0;
124 StringBuilder currentWord = new StringBuilder();
125
126 //Until a dead end in the trie is reached or there is no more words to append
127 while(j < words.size())
128 {
129 if(j == i && Character.isLowerCase(words.get(j).charAt(0)))
130 {
131 break;
132 }
133
134 currentWord = currentWord.append(words.get(j));
135
136 //Check the words in the gazetteer to see if there is a match
137 result = gaz.checkPlaceName(currentWord.toString());
138
139 //If there is a match
140 if(result == 1)
141 {
142 //Store the match in the form <[first word index], [number of words in match]>
143 //Because shorter words are checked first longer matches will overwrite shorter matches
144 matches.put(i, (j-i) + 1);
145 }
146 else if(result == -1)
147 {
148 break;
149 }
150 j++;
151 currentWord = currentWord.append(" ");
152 }
153 }
154
155 //Remove any overlapping matches and return
156 return correctMatches(matches);
157 }
158
159 /**
160 * Takes an array of matches and removes any that overlap
161 * @param matches is the array of matches
162 * @return the corrected array of matches
163 */
164 protected HashMap<Integer, Integer> correctMatches(HashMap<Integer, Integer> matches)
165 {
166 Integer[] keys = matches.keySet().toArray(new Integer[0]);
167
168 //Go through all the keys in the hash map
169 for(int i = 0; i < keys.length-1; i++)
170 {
171 //Ignore keys that have been removed
172 if(matches.get(keys[i]) == null)
173 {
174 continue;
175 }
176
177 //Check and see if the current key overlaps any other keys
178 for(int j = keys[i] + 1; j < keys[i] + matches.get(keys[i]); j++)
179 {
180 //If so, remove the keys it overlaps
181 if(matches.get(j) != null)
182 {
183 matches.remove(j);
184 }
185 }
186 }
187
188 return matches;
189 }
190
191 /**
192 * Divides the given text into words and returns the list of words
193 * @param text is the text to divide
194 * @return the divided text
195 */
196 protected static ArrayList<String> findWords(String text)
197 {
198 ArrayList<String> words = new ArrayList<String>();
199 StringBuilder currentWord = new StringBuilder();
200
201 //Go through each character in the text
202 for(int i = 0; i < text.length(); i++)
203 {
204 //If it is a letter then add it to the current word
205 if(Character.isLetter(text.charAt(i)) || text.charAt(i) == '-')
206 {
207 currentWord.append(text.charAt(i));
208 }
209 //If it is not a letter
210 else
211 {
212 //And a word currently is currently being created
213 if(currentWord.length() > 0 && text.charAt(i - currentWord.length() + 1) != '\'')
214 {
215 //Add the current word to the list of words
216 words.add(currentWord.toString());
217
218 //Delete the current words
219 currentWord.delete(0, currentWord.length());
220 }
221 }
222 }
223
224 //Add the final word of the text
225 if(currentWord.length() > 0)
226 {
227 words.add(currentWord.toString());
228 }
229
230 return words;
231 }
232
233 /**
234 * Removes place names that are unlikely to be meant as place names in a given text
235 * @param gazetteer is the gazetteer to remove the place names from
236 */
237 public void removeAmbiguousPlaceNames(GazetteerTrieType2 gazetteer)
238 {
239 gazetteer.removePlaceName("are"); gazetteer.removePlaceName("is");
240 gazetteer.removePlaceName("over"); gazetteer.removePlaceName("at");
241 gazetteer.removePlaceName("of"); gazetteer.removePlaceName("to");
242 gazetteer.removePlaceName("rule"); gazetteer.removePlaceName("time");
243 gazetteer.removePlaceName("real"); gazetteer.removePlaceName("national");
244 gazetteer.removePlaceName("early"); gazetteer.removePlaceName("by");
245 gazetteer.removePlaceName("as"); gazetteer.removePlaceName("eastern");
246 gazetteer.removePlaceName("western"); gazetteer.removePlaceName("southern");
247 gazetteer.removePlaceName("northern"); gazetteer.removePlaceName("east");
248 gazetteer.removePlaceName("west"); gazetteer.removePlaceName("south");
249 gazetteer.removePlaceName("north"); gazetteer.removePlaceName("this");
250 gazetteer.removePlaceName("between"); gazetteer.removePlaceName("many");
251 gazetteer.removePlaceName("strong"); gazetteer.removePlaceName("economy");
252 gazetteer.removePlaceName("mall"); gazetteer.removePlaceName("they");
253 gazetteer.removePlaceName("do"); gazetteer.removePlaceName("image");
254 gazetteer.removePlaceName("republic"); gazetteer.removePlaceName("section");
255 gazetteer.removePlaceName("dollar"); gazetteer.removePlaceName("index");
256 gazetteer.removePlaceName("day"); gazetteer.removePlaceName("council");
257 gazetteer.removePlaceName("use"); gazetteer.removePlaceName("log");
258 gazetteer.removePlaceName("logo"); gazetteer.removePlaceName("best");
259 gazetteer.removePlaceName("go"); gazetteer.removePlaceName("portal");
260 gazetteer.removePlaceName("list"); gazetteer.removePlaceName("english");
261 gazetteer.removePlaceName("page"); gazetteer.removePlaceName("see");
262 gazetteer.removePlaceName("ocean"); gazetteer.removePlaceName("island");
263 gazetteer.removePlaceName("x"); gazetteer.removePlaceName("country");
264 gazetteer.removePlaceName("colony"); gazetteer.removePlaceName("christian");
265 gazetteer.removePlaceName("black"); gazetteer.removePlaceName("independence");
266 gazetteer.removePlaceName("war"); gazetteer.removePlaceName("no");
267 gazetteer.removePlaceName("continental"); gazetteer.removePlaceName("continental");
268 gazetteer.removePlaceName("force"); gazetteer.removePlaceName("reform");
269 gazetteer.removePlaceName("rush"); gazetteer.removePlaceName("read");
270 gazetteer.removePlaceName("none"); gazetteer.removePlaceName("justice");
271 gazetteer.removePlaceName("font"); gazetteer.removePlaceName("u");
272 gazetteer.removePlaceName("y"); gazetteer.removePlaceName("normal");
273 gazetteer.removePlaceName("center"); gazetteer.removePlaceName("date");
274 gazetteer.removePlaceName("story"); gazetteer.removePlaceName("union");
275 gazetteer.removePlaceName("supreme"); gazetteer.removePlaceName("house");
276 gazetteer.removePlaceName("court"); gazetteer.removePlaceName("data");
277 gazetteer.removePlaceName("energy"); gazetteer.removePlaceName("white");
278 gazetteer.removePlaceName("universal"); gazetteer.removePlaceName("protection");
279 gazetteer.removePlaceName("great"); gazetteer.removePlaceName("star");
280 gazetteer.removePlaceName("banner"); gazetteer.removePlaceName("capital");
281 gazetteer.removePlaceName("much");
282
283 gazetteer.addPlaceName("United States");
284 }
285}
Note: See TracBrowser for help on using the repository browser.