1 | package org.greenstone.server;
|
---|
2 |
|
---|
3 | import gate.util.Out;
|
---|
4 |
|
---|
5 | import java.io.BufferedReader;
|
---|
6 | import java.io.BufferedWriter;
|
---|
7 | import java.io.FileWriter;
|
---|
8 | import java.io.StringReader;
|
---|
9 | import java.util.ArrayList;
|
---|
10 | import java.util.HashMap;
|
---|
11 |
|
---|
12 | import org.greenstone.client.Place;
|
---|
13 |
|
---|
14 | public class PageScanner
|
---|
15 | {
|
---|
16 | // Stores the gazeteer structure (used to verify if words are a place name
|
---|
17 | // or not)
|
---|
18 | protected GazetteerTrieType2 _gazetteer = null;
|
---|
19 |
|
---|
20 | // Stores all of the places in the page that is being examined
|
---|
21 | protected ArrayList<Place> _places = new ArrayList<Place>();
|
---|
22 | protected HashMap<String, ArrayList<Place>> _placeNameMap = new HashMap<String, ArrayList<Place>>();
|
---|
23 | protected String _markedUpText = null;
|
---|
24 | protected HashMap<String, ArrayList<Place>> _placeCache = new HashMap<String, ArrayList<Place>>();
|
---|
25 |
|
---|
26 | protected GateScanner _gateScanner = new GateScanner();
|
---|
27 |
|
---|
28 | // Parameters for score calculations
|
---|
29 | // *********************************
|
---|
30 | protected double _penaltyPercentage = 0.5;
|
---|
31 | protected double _parentBonusPercentage = 0.25;
|
---|
32 | protected double _indirectReferencePenaltyPercentage = 0.25;
|
---|
33 | protected double _parentLimitPercentage = 0.05;
|
---|
34 |
|
---|
35 | protected ArrayList<String> _prevDoc = null;
|
---|
36 | protected String _prevFileName = null;
|
---|
37 |
|
---|
38 | /**
|
---|
39 | * Default constructer. It creates the place data structure and gazetteer
|
---|
40 | * trie structure
|
---|
41 | */
|
---|
42 | public PageScanner(String path)
|
---|
43 | {
|
---|
44 | System.out.println("Loading path = " + path);
|
---|
45 | try
|
---|
46 | {
|
---|
47 | PlaceInformation.init();
|
---|
48 | }
|
---|
49 | catch (Exception ex)
|
---|
50 | {
|
---|
51 | ex.printStackTrace();
|
---|
52 | }
|
---|
53 | System.out.println("Starting loading gazetteer");
|
---|
54 | _gazetteer = new GazetteerTrieType2(path + "/dataen.txt");
|
---|
55 | }
|
---|
56 |
|
---|
57 | /**
|
---|
58 | * Examines the given text to find place names and score them
|
---|
59 | *
|
---|
60 | * @param text
|
---|
61 | * is the text to examine
|
---|
62 | */
|
---|
63 | public ArrayList<Place> examineTextWithGate(String text, String htmlString)
|
---|
64 | {
|
---|
65 | _places.clear();
|
---|
66 |
|
---|
67 | if(htmlString != null && text == null)
|
---|
68 | {
|
---|
69 | text = HTMLParser.removeTags(htmlString);
|
---|
70 | }
|
---|
71 |
|
---|
72 | if (text == null || text.length() < 1 || htmlString == null || htmlString.length() < 1)
|
---|
73 | {
|
---|
74 | return new ArrayList<Place>();
|
---|
75 | }
|
---|
76 |
|
---|
77 | HTMLParser htmlParser = new HTMLParser(htmlString);
|
---|
78 |
|
---|
79 | ArrayList<String> betweenChars = htmlParser.getFullBetweenWordList();
|
---|
80 | ArrayList<String> htmlWords = htmlParser.getFullHTMLWordList();
|
---|
81 |
|
---|
82 | HashMap<Integer, Integer> placeNameIndexesAndLength = new HashMap<Integer, Integer>();
|
---|
83 |
|
---|
84 | try
|
---|
85 | {
|
---|
86 | // Classify all the text with Gate
|
---|
87 | HashMap<String, Word> classifiedWords = _gateScanner.classifyText(text);
|
---|
88 |
|
---|
89 | // Stores the current words being examined
|
---|
90 | StringBuilder currentPotentialPlaceNames = new StringBuilder();
|
---|
91 |
|
---|
92 | // Get the next word from the HTML
|
---|
93 | for (int i = 0; i < htmlWords.size(); i++)
|
---|
94 | {
|
---|
95 | String currentWord = htmlWords.get(i);
|
---|
96 |
|
---|
97 | Word classifiedWord = classifiedWords.get(currentWord);
|
---|
98 | if (classifiedWord == null)
|
---|
99 | {
|
---|
100 | continue;
|
---|
101 | }
|
---|
102 |
|
---|
103 | String classification = classifiedWord.getNextClassification();
|
---|
104 |
|
---|
105 | // If the word does not begin with an uppercase letter then ignore it
|
---|
106 | if (classification == null || Character.isLowerCase(currentWord.charAt(0)) || !(classification.equals("NNP")))
|
---|
107 | {
|
---|
108 | continue;
|
---|
109 | }
|
---|
110 |
|
---|
111 | // Add the first word to the list of words to be examined
|
---|
112 | currentPotentialPlaceNames.append(currentWord);
|
---|
113 |
|
---|
114 | // While the gazetteer does not reach a dead end, add another word and examine them
|
---|
115 | int count = 1;
|
---|
116 | String lastGoodPlaceName = null;
|
---|
117 | while (_gazetteer.checkPlaceName(currentPotentialPlaceNames.toString()) != -1)
|
---|
118 | {
|
---|
119 | // If the words are a place name then store it
|
---|
120 | if (_gazetteer.checkPlaceName(currentPotentialPlaceNames.toString()) == 1)
|
---|
121 | {
|
---|
122 | lastGoodPlaceName = currentPotentialPlaceNames.toString();
|
---|
123 | }
|
---|
124 |
|
---|
125 | // If it is not the end of the words the add the next word
|
---|
126 | if (i + count < htmlWords.size())
|
---|
127 | {
|
---|
128 | currentPotentialPlaceNames.append(betweenChars.get(i + count) + htmlWords.get(i + count++));
|
---|
129 | }
|
---|
130 | else
|
---|
131 | {
|
---|
132 | break;
|
---|
133 | }
|
---|
134 | }
|
---|
135 |
|
---|
136 | // If there was a place name found then find its information and score it
|
---|
137 | if (lastGoodPlaceName != null)
|
---|
138 | {
|
---|
139 | placeNameIndexesAndLength.put(i, count);
|
---|
140 |
|
---|
141 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
142 |
|
---|
143 | if (placeList == null)
|
---|
144 | {
|
---|
145 | continue;
|
---|
146 | }
|
---|
147 |
|
---|
148 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
149 | {
|
---|
150 | for (Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
151 | {
|
---|
152 | p.directReference();
|
---|
153 | }
|
---|
154 | }
|
---|
155 |
|
---|
156 | for (Place p : placeList)
|
---|
157 | {
|
---|
158 | p.directReference();
|
---|
159 | addScore(p, 256);
|
---|
160 | }
|
---|
161 | }
|
---|
162 |
|
---|
163 | currentPotentialPlaceNames = new StringBuilder();
|
---|
164 | }
|
---|
165 | }
|
---|
166 | catch (Exception ex)
|
---|
167 | {
|
---|
168 | ex.printStackTrace();
|
---|
169 | }
|
---|
170 |
|
---|
171 |
|
---|
172 | // Tidy up the places found by removing overlapping places
|
---|
173 | for(int i = 0; i < htmlWords.size(); i++)
|
---|
174 | {
|
---|
175 | if(placeNameIndexesAndLength.get(i) != null)
|
---|
176 | {
|
---|
177 | int length = placeNameIndexesAndLength.get(i);
|
---|
178 | for(int j = 1; j < length; j++)
|
---|
179 | {
|
---|
180 | if(placeNameIndexesAndLength.get(i + j) != null)
|
---|
181 | {
|
---|
182 | placeNameIndexesAndLength.remove(i + j);
|
---|
183 | }
|
---|
184 | }
|
---|
185 | }
|
---|
186 | }
|
---|
187 |
|
---|
188 | StringBuilder htmlBuilder = new StringBuilder();
|
---|
189 |
|
---|
190 | int placeEnd = -1;
|
---|
191 | for(int i = 0; i < htmlWords.size(); i++)
|
---|
192 | {
|
---|
193 | if(i == placeEnd)
|
---|
194 | {
|
---|
195 | htmlBuilder.append("</span>");
|
---|
196 | }
|
---|
197 |
|
---|
198 | if(i < betweenChars.size() && betweenChars.get(i) != null)
|
---|
199 | {
|
---|
200 | htmlBuilder.append(betweenChars.get(i));
|
---|
201 | }
|
---|
202 |
|
---|
203 | if(placeNameIndexesAndLength.get(i) != null)
|
---|
204 | {
|
---|
205 | htmlBuilder.append("<span class=\"place\">");
|
---|
206 | placeEnd = i + (placeNameIndexesAndLength.get(i) - 1);
|
---|
207 | }
|
---|
208 | htmlBuilder.append(htmlWords.get(i));
|
---|
209 | }
|
---|
210 |
|
---|
211 | _markedUpText = htmlBuilder.toString();
|
---|
212 |
|
---|
213 | if (_places.size() > 0)
|
---|
214 | {
|
---|
215 | adjustScores();
|
---|
216 | }
|
---|
217 |
|
---|
218 | return _places;
|
---|
219 | }
|
---|
220 |
|
---|
221 | public ArrayList<Place> examineTextWithoutGate(ArrayList<ArrayList<String>> lines)
|
---|
222 | {
|
---|
223 | _places.clear();
|
---|
224 | try
|
---|
225 | {
|
---|
226 | // Stores the current words being examined
|
---|
227 | StringBuilder currentWords = new StringBuilder();
|
---|
228 |
|
---|
229 | // Read each line from the file
|
---|
230 | for (ArrayList<String> words : lines)
|
---|
231 | {
|
---|
232 | // Examine the words
|
---|
233 | for (int j = 0; j < words.size(); j++)
|
---|
234 | {
|
---|
235 | // If the word does not begin with an uppercase letter then
|
---|
236 | // ignore it
|
---|
237 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
238 | {
|
---|
239 | continue;
|
---|
240 | }
|
---|
241 |
|
---|
242 | // Used to store a good place name
|
---|
243 | String lastGoodPlaceName = null;
|
---|
244 |
|
---|
245 | // Add the first word to the list of words to be examined
|
---|
246 | currentWords.append(words.get(j));
|
---|
247 |
|
---|
248 | // While the gazetteer does not reach a dead end, add
|
---|
249 | // another word and examine them
|
---|
250 | int count = 1;
|
---|
251 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
252 | {
|
---|
253 | // If the words are a place name then store it
|
---|
254 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
255 | {
|
---|
256 | lastGoodPlaceName = currentWords.toString();
|
---|
257 | // System.out.println("Current place name part => "
|
---|
258 | // + lastGoodPlaceName);
|
---|
259 | }
|
---|
260 |
|
---|
261 | // If it is not the end of the words the add the next
|
---|
262 | // word
|
---|
263 | if (j + count < words.size())
|
---|
264 | {
|
---|
265 | currentWords.append(" " + words.get(j + count++));
|
---|
266 | }
|
---|
267 | else
|
---|
268 | {
|
---|
269 | break;
|
---|
270 | }
|
---|
271 | }
|
---|
272 |
|
---|
273 | // If there was a place name found then find its information
|
---|
274 | // and score it
|
---|
275 | if (lastGoodPlaceName != null)
|
---|
276 | {
|
---|
277 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
278 |
|
---|
279 | if (placeList == null)
|
---|
280 | {
|
---|
281 | continue;
|
---|
282 | }
|
---|
283 |
|
---|
284 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
285 | {
|
---|
286 | for (Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
287 | {
|
---|
288 | p.directReference();
|
---|
289 | }
|
---|
290 | }
|
---|
291 |
|
---|
292 | for (Place p : placeList)
|
---|
293 | {
|
---|
294 | p.directReference();
|
---|
295 | addScore(p, 256);
|
---|
296 | }
|
---|
297 | }
|
---|
298 | currentWords = new StringBuilder();
|
---|
299 | }
|
---|
300 | }
|
---|
301 | }
|
---|
302 | catch (Exception ex)
|
---|
303 | {
|
---|
304 | ex.printStackTrace();
|
---|
305 | }
|
---|
306 |
|
---|
307 | adjustScores();
|
---|
308 |
|
---|
309 | return _places;
|
---|
310 | }
|
---|
311 |
|
---|
312 | public ArrayList<String> getPlaceNames(String text)
|
---|
313 | {
|
---|
314 | ArrayList<String> placeNames = new ArrayList<String>();
|
---|
315 | _places.clear();
|
---|
316 | try
|
---|
317 | {
|
---|
318 | // The file to read from
|
---|
319 | BufferedReader file = new BufferedReader(new StringReader(text));
|
---|
320 |
|
---|
321 | // Stores the current words being examined
|
---|
322 | StringBuilder currentWords = new StringBuilder();
|
---|
323 |
|
---|
324 | // Stores the current line being examined
|
---|
325 | String currentLine = null;
|
---|
326 |
|
---|
327 | // The list of words in the line
|
---|
328 | ArrayList<String> words = new ArrayList<String>();
|
---|
329 |
|
---|
330 | // Read each line from the file
|
---|
331 | while ((currentLine = file.readLine()) != null)
|
---|
332 | {
|
---|
333 | words.clear();
|
---|
334 |
|
---|
335 | // Find the words in those lines and add them to the list
|
---|
336 | words.addAll(MarkupService.findWords(currentLine));
|
---|
337 |
|
---|
338 | // Examine the words
|
---|
339 | for (int j = 0; j < words.size(); j++)
|
---|
340 | {
|
---|
341 | // If the word does not begin with an uppercase letter then
|
---|
342 | // ignore it
|
---|
343 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
344 | {
|
---|
345 | continue;
|
---|
346 | }
|
---|
347 |
|
---|
348 | // Used to store a good place name
|
---|
349 | String lastGoodPlaceName = null;
|
---|
350 |
|
---|
351 | // Add the first word to the list of words to be examined
|
---|
352 | currentWords.append(words.get(j));
|
---|
353 |
|
---|
354 | // While the gazetteer does not reach a dead end, add
|
---|
355 | // another word and examine them
|
---|
356 | int count = 1;
|
---|
357 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
358 | {
|
---|
359 | // If the words are a place name then store it
|
---|
360 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
361 | {
|
---|
362 | lastGoodPlaceName = currentWords.toString();
|
---|
363 | // System.out.println("Current place name part => "
|
---|
364 | // + lastGoodPlaceName);
|
---|
365 | }
|
---|
366 |
|
---|
367 | // If it is not the end of the words the add the next
|
---|
368 | // word
|
---|
369 | if (j + count < words.size())
|
---|
370 | {
|
---|
371 | currentWords.append(" " + words.get(j + count++));
|
---|
372 | }
|
---|
373 | else
|
---|
374 | {
|
---|
375 | break;
|
---|
376 | }
|
---|
377 | }
|
---|
378 |
|
---|
379 | // If there was a place name found then find its information
|
---|
380 | // and score it
|
---|
381 | if (lastGoodPlaceName != null)
|
---|
382 | {
|
---|
383 | placeNames.add(lastGoodPlaceName);
|
---|
384 | }
|
---|
385 | currentWords = new StringBuilder();
|
---|
386 | }
|
---|
387 | }
|
---|
388 | file.close();
|
---|
389 | }
|
---|
390 | catch (Exception ex)
|
---|
391 | {
|
---|
392 | ex.printStackTrace();
|
---|
393 | }
|
---|
394 |
|
---|
395 | return placeNames;
|
---|
396 | }
|
---|
397 |
|
---|
398 | public ArrayList<Place> examineArrayOfStrings(ArrayList<String> placeNames)
|
---|
399 | {
|
---|
400 | _places.clear();
|
---|
401 | for (int i = 0; i < placeNames.size(); i++)
|
---|
402 | {
|
---|
403 | String currentPlaceName = placeNames.get(i);
|
---|
404 |
|
---|
405 | ArrayList<Place> placeList = null;
|
---|
406 | if (_placeCache.containsKey(currentPlaceName))
|
---|
407 | {
|
---|
408 | placeList = _placeCache.get(currentPlaceName);
|
---|
409 | for (Place p : placeList)
|
---|
410 | {
|
---|
411 | p.unDirectReference();
|
---|
412 | }
|
---|
413 | }
|
---|
414 | else
|
---|
415 | {
|
---|
416 | placeList = PlaceInformation.getPlaces(currentPlaceName);
|
---|
417 | _placeCache.put(currentPlaceName, placeList);
|
---|
418 | }
|
---|
419 |
|
---|
420 | if (placeList == null)
|
---|
421 | {
|
---|
422 | continue;
|
---|
423 | }
|
---|
424 |
|
---|
425 | if (_placeNameMap.containsKey(currentPlaceName))
|
---|
426 | {
|
---|
427 | for (Place p : _placeNameMap.get(currentPlaceName))
|
---|
428 | {
|
---|
429 | p.directReference();
|
---|
430 | }
|
---|
431 | }
|
---|
432 |
|
---|
433 | for (Place p : placeList)
|
---|
434 | {
|
---|
435 | p.directReference();
|
---|
436 | addScore(p, 256);
|
---|
437 | }
|
---|
438 | }
|
---|
439 |
|
---|
440 | // System.out.println("Done!");
|
---|
441 | // System.out.print("Adjusting Scores... ");
|
---|
442 |
|
---|
443 | adjustScores();
|
---|
444 |
|
---|
445 | // System.out.println("Done!");
|
---|
446 | // System.out.print("Sorting Scores... ");
|
---|
447 |
|
---|
448 | // System.out.println("Done!");
|
---|
449 |
|
---|
450 | // System.out.println("Places found = " + _places);
|
---|
451 |
|
---|
452 | return _places;
|
---|
453 | }
|
---|
454 |
|
---|
455 | public ArrayList<Place> examineTextWithoutGate(String text)
|
---|
456 | {
|
---|
457 | _places.clear();
|
---|
458 | try
|
---|
459 | {
|
---|
460 | // The file to read from
|
---|
461 | BufferedReader file = new BufferedReader(new StringReader(text));
|
---|
462 |
|
---|
463 | // Stores the current words being examined
|
---|
464 | StringBuilder currentWords = new StringBuilder();
|
---|
465 |
|
---|
466 | // Stores the current line being examined
|
---|
467 | String currentLine = null;
|
---|
468 |
|
---|
469 | // The list of words in the line
|
---|
470 | ArrayList<String> words = new ArrayList<String>();
|
---|
471 |
|
---|
472 | // Read each line from the file
|
---|
473 | while ((currentLine = file.readLine()) != null)
|
---|
474 | {
|
---|
475 | words.clear();
|
---|
476 |
|
---|
477 | // Find the words in those lines and add them to the list
|
---|
478 | words.addAll(MarkupService.findWords(currentLine));
|
---|
479 |
|
---|
480 | // Examine the words
|
---|
481 | for (int j = 0; j < words.size(); j++)
|
---|
482 | {
|
---|
483 | // If the word does not begin with an uppercase letter then
|
---|
484 | // ignore it
|
---|
485 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
486 | {
|
---|
487 | continue;
|
---|
488 | }
|
---|
489 |
|
---|
490 | // Used to store a good place name
|
---|
491 | String lastGoodPlaceName = null;
|
---|
492 |
|
---|
493 | // Add the first word to the list of words to be examined
|
---|
494 | currentWords.append(words.get(j));
|
---|
495 |
|
---|
496 | // While the gazetteer does not reach a dead end, add
|
---|
497 | // another word and examine them
|
---|
498 | int count = 1;
|
---|
499 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
500 | {
|
---|
501 | // If the words are a place name then store it
|
---|
502 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
503 | {
|
---|
504 | lastGoodPlaceName = currentWords.toString();
|
---|
505 | // System.out.println("Current place name part => "
|
---|
506 | // + lastGoodPlaceName);
|
---|
507 | }
|
---|
508 |
|
---|
509 | // If it is not the end of the words the add the next
|
---|
510 | // word
|
---|
511 | if (j + count < words.size())
|
---|
512 | {
|
---|
513 | currentWords.append(" " + words.get(j + count++));
|
---|
514 | }
|
---|
515 | else
|
---|
516 | {
|
---|
517 | break;
|
---|
518 | }
|
---|
519 | }
|
---|
520 |
|
---|
521 | // If there was a place name found then find its information
|
---|
522 | // and score it
|
---|
523 | if (lastGoodPlaceName != null)
|
---|
524 | {
|
---|
525 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
526 |
|
---|
527 | if (placeList == null)
|
---|
528 | {
|
---|
529 | continue;
|
---|
530 | }
|
---|
531 |
|
---|
532 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
533 | {
|
---|
534 | for (Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
535 | {
|
---|
536 | p.directReference();
|
---|
537 | }
|
---|
538 | }
|
---|
539 |
|
---|
540 | for (Place p : placeList)
|
---|
541 | {
|
---|
542 | p.directReference();
|
---|
543 | addScore(p, 256);
|
---|
544 | }
|
---|
545 | }
|
---|
546 | currentWords = new StringBuilder();
|
---|
547 | }
|
---|
548 | }
|
---|
549 | file.close();
|
---|
550 | }
|
---|
551 | catch (Exception ex)
|
---|
552 | {
|
---|
553 | ex.printStackTrace();
|
---|
554 | }
|
---|
555 |
|
---|
556 | // System.out.println("Done!");
|
---|
557 | // System.out.print("Adjusting Scores... ");
|
---|
558 |
|
---|
559 | adjustScores();
|
---|
560 |
|
---|
561 | // System.out.println("Done!");
|
---|
562 | // System.out.print("Sorting Scores... ");
|
---|
563 | // System.out.println("Done!");
|
---|
564 | // System.out.println("Places found = " + _places);
|
---|
565 |
|
---|
566 | return _places;
|
---|
567 | }
|
---|
568 |
|
---|
569 | public String getMarkedUpText()
|
---|
570 | {
|
---|
571 | return _markedUpText;
|
---|
572 | }
|
---|
573 |
|
---|
574 | public Place getTopScorePlace()
|
---|
575 | {
|
---|
576 | Place p = _places.get(0);
|
---|
577 |
|
---|
578 | for (Place pp : _places)
|
---|
579 | {
|
---|
580 | if (pp.getScore() > p.getScore())
|
---|
581 | {
|
---|
582 | p = pp;
|
---|
583 | }
|
---|
584 | }
|
---|
585 | return p;
|
---|
586 | }
|
---|
587 |
|
---|
588 | /**
|
---|
589 | * Used to make the original place scores more accurate by using other place
|
---|
590 | * information
|
---|
591 | */
|
---|
592 | public void adjustScores()
|
---|
593 | {
|
---|
594 | for (Place p : _places)
|
---|
595 | {
|
---|
596 | if (!p.isDirectlyReferenced())
|
---|
597 | {
|
---|
598 | p.setScore((int) (p.getScore() * _penaltyPercentage));
|
---|
599 | }
|
---|
600 | }
|
---|
601 |
|
---|
602 | Place topScore = getTopScorePlace();
|
---|
603 |
|
---|
604 | for (Place p : _places)
|
---|
605 | {
|
---|
606 | if (p.getParentPlaceName() == null)
|
---|
607 | {
|
---|
608 | continue;
|
---|
609 | }
|
---|
610 |
|
---|
611 | for (Place pp : _places)
|
---|
612 | {
|
---|
613 | if (p.isIn(pp) && (topScore.getScore() - pp.getScore()) <= (int) (topScore.getScore() * 0.1 * _parentLimitPercentage))
|
---|
614 | {
|
---|
615 | p.setScore((int) (p.getScore() + (pp.getScore() * _parentBonusPercentage)));
|
---|
616 | }
|
---|
617 | }
|
---|
618 | }
|
---|
619 | }
|
---|
620 |
|
---|
621 | /**
|
---|
622 | * Adds one to the score of the given place
|
---|
623 | *
|
---|
624 | * @param p
|
---|
625 | * is the place to add one to the score of
|
---|
626 | */
|
---|
627 | public void addScore(Place p, Integer scoreToAdd)
|
---|
628 | {
|
---|
629 | if (p == null)
|
---|
630 | {
|
---|
631 | return;
|
---|
632 | }
|
---|
633 |
|
---|
634 | // If there is already a score for this key the increase it by one
|
---|
635 | if (_places.contains(p))
|
---|
636 | {
|
---|
637 | Place place = _places.get(_places.indexOf(p));
|
---|
638 | place.setScore(place.getScore() + scoreToAdd);
|
---|
639 | }
|
---|
640 | // If there is no score for this key then make one
|
---|
641 | else
|
---|
642 | {
|
---|
643 | p.setScore(scoreToAdd);
|
---|
644 | _places.add(p);
|
---|
645 |
|
---|
646 | if (_placeNameMap.containsKey(p.getName()))
|
---|
647 | {
|
---|
648 | _placeNameMap.get(p.getName()).add(p);
|
---|
649 | }
|
---|
650 | else
|
---|
651 | {
|
---|
652 | ArrayList<Place> placeList = new ArrayList<Place>();
|
---|
653 | _placeNameMap.put(p.getName(), placeList);
|
---|
654 | }
|
---|
655 | }
|
---|
656 |
|
---|
657 | // Add to the parent's score too (if there is one)
|
---|
658 | if (!(p.getParentPlaceName() == null))
|
---|
659 | {
|
---|
660 | // If there is an ancestor then add to it's score as well
|
---|
661 | if (p.getParentPlaceName().contains(", "))
|
---|
662 | {
|
---|
663 | String[] places = p.getParentPlaceName().split(", ");
|
---|
664 |
|
---|
665 | ArrayList<Place> specificPlaces = null;
|
---|
666 |
|
---|
667 | if (places.length == 3)
|
---|
668 | {
|
---|
669 | specificPlaces = PlaceInformation.getSpecificPlace(places[0], places[1] + ", " + places[2]);
|
---|
670 | }
|
---|
671 | else
|
---|
672 | {
|
---|
673 | specificPlaces = PlaceInformation.getSpecificPlace(places[0], places[1]);
|
---|
674 | }
|
---|
675 |
|
---|
676 | if (specificPlaces == null)
|
---|
677 | {
|
---|
678 | return;
|
---|
679 | }
|
---|
680 |
|
---|
681 | for (Place pp : specificPlaces)
|
---|
682 | {
|
---|
683 | addScore(pp, (int) (scoreToAdd * _indirectReferencePenaltyPercentage));
|
---|
684 | }
|
---|
685 | }
|
---|
686 | // Otherwise just add the parent
|
---|
687 | else
|
---|
688 | {
|
---|
689 | ArrayList<Place> specificPlaces = PlaceInformation.getSpecificPlace(p.getParentPlaceName(), null);
|
---|
690 |
|
---|
691 | if (specificPlaces == null)
|
---|
692 | {
|
---|
693 | return;
|
---|
694 | }
|
---|
695 |
|
---|
696 | for (Place pp : specificPlaces)
|
---|
697 | {
|
---|
698 | addScore(pp, (int) (scoreToAdd * _indirectReferencePenaltyPercentage));
|
---|
699 | }
|
---|
700 | }
|
---|
701 | }
|
---|
702 | }
|
---|
703 |
|
---|
704 | public void sortScores()
|
---|
705 | {
|
---|
706 | ArrayList<Place> sortedPlaces = new ArrayList<Place>();
|
---|
707 |
|
---|
708 | while (_places.size() > 0)
|
---|
709 | {
|
---|
710 | int index = -1;
|
---|
711 | for (int j = 0; j < _places.size(); j++)
|
---|
712 | {
|
---|
713 | if (index == -1 || _places.get(j).getScore() > _places.get(index).getScore())
|
---|
714 | {
|
---|
715 | index = j;
|
---|
716 | }
|
---|
717 | }
|
---|
718 |
|
---|
719 | sortedPlaces.add(_places.remove(index));
|
---|
720 | }
|
---|
721 |
|
---|
722 | _places = sortedPlaces;
|
---|
723 | }
|
---|
724 |
|
---|
725 | public ArrayList<Place> getPlacesWithParams(double maxScorePercentage, double minScorePercentage, long minPopulation, boolean locality, boolean region, boolean country, int numberOfPlacesToGet)
|
---|
726 | {
|
---|
727 | // System.out.println("Getting places with params");
|
---|
728 |
|
---|
729 | int topScore = _places.get(0).getScore();
|
---|
730 | int minScore = (int) (topScore * minScorePercentage);
|
---|
731 | int maxScore = (int) (topScore * maxScorePercentage);
|
---|
732 |
|
---|
733 | // System.out.println("minScore = " + minScore);
|
---|
734 | // System.out.println("maxScore = " + maxScore);
|
---|
735 |
|
---|
736 | ArrayList<Place> matchingPlaces = new ArrayList<Place>();
|
---|
737 |
|
---|
738 | // Go through all the markers
|
---|
739 | for (Place p : _places)
|
---|
740 | {
|
---|
741 | // System.out.println("Testing place -> " + p.getName());
|
---|
742 | if (p == null)
|
---|
743 | {
|
---|
744 | // System.out.println("P is null?");
|
---|
745 | continue;
|
---|
746 | }
|
---|
747 |
|
---|
748 | // If the place meets the criteria
|
---|
749 | if (p.getScore() > minScore && p.getScore() <= maxScore && p.getPopulation() > minPopulation && ((locality && p.getPlaceType().equals("locality")) || (region && p.getPlaceType().equals("region")) || (country && p.getPlaceType().equals("country"))))
|
---|
750 | {
|
---|
751 | // System.out.println("MATCH!");
|
---|
752 | // If there is not already the maximum amount of visible places
|
---|
753 | // then add this
|
---|
754 | if (matchingPlaces.size() < numberOfPlacesToGet)
|
---|
755 | {
|
---|
756 | matchingPlaces.add(p);
|
---|
757 | }
|
---|
758 | // If there is already MAXVISIBLE visible places then see if
|
---|
759 | // this place should replace one
|
---|
760 | else
|
---|
761 | {
|
---|
762 | Place minScorePlace = null;
|
---|
763 | for (Place q : matchingPlaces)
|
---|
764 | {
|
---|
765 | if (minScorePlace == null)
|
---|
766 | {
|
---|
767 | minScorePlace = q;
|
---|
768 | }
|
---|
769 | if (q.getScore() < minScorePlace.getScore())
|
---|
770 | {
|
---|
771 | minScorePlace = q;
|
---|
772 | }
|
---|
773 | }
|
---|
774 |
|
---|
775 | if (p.getScore() > minScorePlace.getScore())
|
---|
776 | {
|
---|
777 | matchingPlaces.remove(minScorePlace);
|
---|
778 | matchingPlaces.add(p);
|
---|
779 | }
|
---|
780 | }
|
---|
781 | }
|
---|
782 | else
|
---|
783 | {
|
---|
784 | // System.out.println("NOT A MATCH");
|
---|
785 | }
|
---|
786 | }
|
---|
787 | return matchingPlaces;
|
---|
788 | }
|
---|
789 |
|
---|
790 | public void setScoringParams(double penaltyPercentage, double parentBonusPercentage, double indirectReferencePenaltyPercentage)
|
---|
791 | {
|
---|
792 | _penaltyPercentage = penaltyPercentage;
|
---|
793 | _parentBonusPercentage = parentBonusPercentage;
|
---|
794 | _indirectReferencePenaltyPercentage = indirectReferencePenaltyPercentage;
|
---|
795 | }
|
---|
796 |
|
---|
797 | public void setScoringParams(ScanConfiguration config)
|
---|
798 | {
|
---|
799 | _penaltyPercentage = config.getPenalty();
|
---|
800 | _parentBonusPercentage = config.getParentBonus();
|
---|
801 | _indirectReferencePenaltyPercentage = config.getIndirectReferencePenalty();
|
---|
802 | _parentLimitPercentage = config.getParentLimit();
|
---|
803 | }
|
---|
804 |
|
---|
805 | // public HashMap<String, Integer> wordCount(String fileName)
|
---|
806 | // {
|
---|
807 | // HashMap<String, Integer> wordCountMap = new HashMap<String, Integer>();
|
---|
808 | //
|
---|
809 | // try
|
---|
810 | // {
|
---|
811 | // BufferedReader file = new BufferedReader(new FileReader(fileName));
|
---|
812 | //
|
---|
813 | // StringBuilder currentWord = new StringBuilder();
|
---|
814 | //
|
---|
815 | // String line = "";
|
---|
816 | //
|
---|
817 | // ArrayList<String> words = new ArrayList<String>();
|
---|
818 | //
|
---|
819 | // //System.out.print("Finding words... ");
|
---|
820 | // while((line = file.readLine()) != null)
|
---|
821 | // {
|
---|
822 | // words.addAll(MarkupService.findWords(line));
|
---|
823 | // }
|
---|
824 | // //System.out.println("Done!");
|
---|
825 | //
|
---|
826 | // //System.out.print("Adding up scores... ");
|
---|
827 | // for(int j = 0; j < words.size(); j++)
|
---|
828 | // {
|
---|
829 | // if(Character.isLowerCase(words.get(j).charAt(0)))
|
---|
830 | // {
|
---|
831 | // //continue;
|
---|
832 | // }
|
---|
833 | //
|
---|
834 | // currentWord.append(words.get(j));
|
---|
835 | //
|
---|
836 | // int count = 1;
|
---|
837 | // while(_gazetteer.checkPlaceName(currentWord.toString()) != -1)
|
---|
838 | // {
|
---|
839 | // if(_gazetteer.checkPlaceName(currentWord.toString()) == 1)
|
---|
840 | // {
|
---|
841 | // if(wordCountMap.containsKey(currentWord.toString()))
|
---|
842 | // {
|
---|
843 | // Integer i = wordCountMap.get(currentWord.toString());
|
---|
844 | // wordCountMap.put(currentWord.toString(), ++i);
|
---|
845 | // }
|
---|
846 | // else
|
---|
847 | // {
|
---|
848 | // wordCountMap.put(currentWord.toString(), 1);
|
---|
849 | // }
|
---|
850 | // }
|
---|
851 | // currentWord.append(" " + words.get(j + count++));
|
---|
852 | // }
|
---|
853 | //
|
---|
854 | // currentWord.delete(0, currentWord.length());
|
---|
855 | // }
|
---|
856 | // }
|
---|
857 | // catch(Exception ex)
|
---|
858 | // {
|
---|
859 | // ex.printStackTrace();
|
---|
860 | // }
|
---|
861 | //
|
---|
862 | // return wordCountMap;
|
---|
863 | // }
|
---|
864 |
|
---|
865 | public boolean isGazetteerLoaded()
|
---|
866 | {
|
---|
867 | return _gazetteer != null;
|
---|
868 | }
|
---|
869 |
|
---|
870 | public ArrayList<ArrayList<Place>> examineMultipleTexts(String[] texts)
|
---|
871 | {
|
---|
872 | ArrayList<ArrayList<Place>> multipleResults = new ArrayList<ArrayList<Place>>();
|
---|
873 | for (String text : texts)
|
---|
874 | {
|
---|
875 | if (text != null)
|
---|
876 | {
|
---|
877 | this.examineTextWithGate(null, text);
|
---|
878 |
|
---|
879 | if (_places.size() > 0)
|
---|
880 | {
|
---|
881 | multipleResults.add(new ArrayList<Place>(_places));
|
---|
882 | }
|
---|
883 | else
|
---|
884 | {
|
---|
885 | multipleResults.add(null);
|
---|
886 | }
|
---|
887 | }
|
---|
888 | else
|
---|
889 | {
|
---|
890 | multipleResults.add(null);
|
---|
891 | }
|
---|
892 | }
|
---|
893 |
|
---|
894 | return multipleResults;
|
---|
895 | }
|
---|
896 |
|
---|
897 | public ArrayList<Place> getPlaces()
|
---|
898 | {
|
---|
899 | return _places;
|
---|
900 | }
|
---|
901 |
|
---|
902 | public void clearPlaces()
|
---|
903 | {
|
---|
904 | _places.clear();
|
---|
905 | }
|
---|
906 | }
|
---|
907 |
|
---|
908 | /*
|
---|
909 | *
|
---|
910 | * // Halve the scores for places that do not have their parents // mentioned //
|
---|
911 | * ********************************************************************
|
---|
912 | *
|
---|
913 | * String[] parentPlaceNames = p.getParentPlaceName().split(", "); if
|
---|
914 | * (parentPlaceNames.length == 1) { ArrayList<Place> parentList =
|
---|
915 | * PlaceInformation.getSpecificPlace(parentPlaceNames[0], null); Place parent =
|
---|
916 | * null;
|
---|
917 | *
|
---|
918 | * if (parentList != null && parentList.size() > 0) { parent =
|
---|
919 | * parentList.get(0);
|
---|
920 | *
|
---|
921 | * if (!_places.contains(parent) ||
|
---|
922 | * !_places.get(_places.indexOf(parent)).isDirectlyReferenced()) {
|
---|
923 | * p.setScore((int) (p.getScore() * (1 - _penaltyPercentage))); } } } else if
|
---|
924 | * (parentPlaceNames.length == 2) { ArrayList<Place> parentList =
|
---|
925 | * PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1]);
|
---|
926 | * ArrayList<Place> ancestorList =
|
---|
927 | * PlaceInformation.getSpecificPlace(parentPlaceNames[1], null);
|
---|
928 | *
|
---|
929 | * Place parent = null; Place ancestor = null;
|
---|
930 | *
|
---|
931 | * if (parentList != null && parentList.size() > 0) { parent =
|
---|
932 | * parentList.get(0);
|
---|
933 | *
|
---|
934 | * if (!_places.contains(parent) ||
|
---|
935 | * !_places.get(_places.indexOf(parent)).isDirectlyReferenced()) {
|
---|
936 | * p.setScore((int) (p.getScore() * (1 - _penaltyPercentage))); } }
|
---|
937 | *
|
---|
938 | * if (ancestorList != null && ancestorList.size() > 0) { ancestor =
|
---|
939 | * ancestorList.get(0);
|
---|
940 | *
|
---|
941 | * if (!_places.contains(ancestor) ||
|
---|
942 | * !_places.get(_places.indexOf(ancestor)).isDirectlyReferenced()) {
|
---|
943 | * p.setScore((int) (p.getScore() * (1 - _penaltyPercentage))); } } } else if
|
---|
944 | * (parentPlaceNames.length == 3) { ArrayList<Place> parentList =
|
---|
945 | * PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1] +
|
---|
946 | * ", " + parentPlaceNames[2]); ArrayList<Place> firstAncestorList =
|
---|
947 | * PlaceInformation.getSpecificPlace(parentPlaceNames[1], parentPlaceNames[2]);
|
---|
948 | * ArrayList<Place> secondAncestorList =
|
---|
949 | * PlaceInformation.getSpecificPlace(parentPlaceNames[2], null);
|
---|
950 | *
|
---|
951 | * Place parent = null; Place firstAncestor = null; Place secondAncestor = null;
|
---|
952 | *
|
---|
953 | * if (parentList != null && parentList.size() > 0) { parent =
|
---|
954 | * parentList.get(0);
|
---|
955 | *
|
---|
956 | * if (!_places.contains(parent) ||
|
---|
957 | * !_places.get(_places.indexOf(parent)).isDirectlyReferenced()) {
|
---|
958 | * p.setScore((int) (p.getScore() * (1 - _penaltyPercentage))); } }
|
---|
959 | *
|
---|
960 | * if (firstAncestorList != null && firstAncestorList.size() > 0) {
|
---|
961 | * firstAncestor = firstAncestorList.get(0);
|
---|
962 | *
|
---|
963 | * if (!_places.contains(firstAncestor) ||
|
---|
964 | * !_places.get(_places.indexOf(firstAncestor)).isDirectlyReferenced()) {
|
---|
965 | * p.setScore((int) (p.getScore() * (1 - _penaltyPercentage))); } }
|
---|
966 | *
|
---|
967 | * if (secondAncestorList != null && secondAncestorList.size() > 0) {
|
---|
968 | * secondAncestor = secondAncestorList.get(0);
|
---|
969 | *
|
---|
970 | * if (!_places.contains(secondAncestor) ||
|
---|
971 | * !_places.get(_places.indexOf(secondAncestor)).isDirectlyReferenced()) {
|
---|
972 | * p.setScore((int) (p.getScore() * (1 - _penaltyPercentage))); } } }
|
---|
973 | */
|
---|
974 | // Add part of the parent's score to the child
|
---|
975 | // *******************************************
|
---|
976 | /*
|
---|
977 | * if (parentPlaceNames.length == 1) { ArrayList<Place> parentList =
|
---|
978 | * PlaceInformation.getSpecificPlace(parentPlaceNames[0], null);
|
---|
979 | *
|
---|
980 | * Place parent = null;
|
---|
981 | *
|
---|
982 | * if (parentList != null && parentList.size() > 0) { parent =
|
---|
983 | * parentList.get(0);
|
---|
984 | *
|
---|
985 | * if (_places.contains(parent)) { p.setScore(p.getScore() + (int)
|
---|
986 | * (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
987 | * } } } else if (parentPlaceNames.length == 2) { ArrayList<Place> parentList =
|
---|
988 | * PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1]);
|
---|
989 | * ArrayList<Place> ancestorList =
|
---|
990 | * PlaceInformation.getSpecificPlace(parentPlaceNames[1], null);
|
---|
991 | *
|
---|
992 | * Place parent = null; Place ancestor = null;
|
---|
993 | *
|
---|
994 | * if (parentList != null && parentList.size() > 0) { parent =
|
---|
995 | * parentList.get(0);
|
---|
996 | *
|
---|
997 | * if (_places.contains(parent)) { p.setScore(p.getScore() + (int)
|
---|
998 | * (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
999 | * } }
|
---|
1000 | *
|
---|
1001 | * if (ancestorList != null && ancestorList.size() > 0) { ancestor =
|
---|
1002 | * ancestorList.get(0);
|
---|
1003 | *
|
---|
1004 | * if (_places.contains(ancestor)) { p.setScore(p.getScore() + (int)
|
---|
1005 | * (_places.get(_places.indexOf(ancestor)).getScore() *
|
---|
1006 | * _parentBonusPercentage)); } } } else if (parentPlaceNames.length == 3) {
|
---|
1007 | * ArrayList<Place> parentList =
|
---|
1008 | * PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1] +
|
---|
1009 | * ", " + parentPlaceNames[2]); ArrayList<Place> firstAncestorList =
|
---|
1010 | * PlaceInformation.getSpecificPlace(parentPlaceNames[1], parentPlaceNames[2]);
|
---|
1011 | * ArrayList<Place> secondAncestorList =
|
---|
1012 | * PlaceInformation.getSpecificPlace(parentPlaceNames[2], null);
|
---|
1013 | *
|
---|
1014 | * Place parent = null; Place firstAncestor = null; Place secondAncestor = null;
|
---|
1015 | *
|
---|
1016 | * if (parentList != null && parentList.size() > 0) { parent =
|
---|
1017 | * parentList.get(0);
|
---|
1018 | *
|
---|
1019 | * if (_places.contains(parent)) { p.setScore(p.getScore() + (int)
|
---|
1020 | * (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
1021 | * } }
|
---|
1022 | *
|
---|
1023 | * if (firstAncestorList != null && firstAncestorList.size() > 0) {
|
---|
1024 | * firstAncestor = firstAncestorList.get(0);
|
---|
1025 | *
|
---|
1026 | * if (_places.contains(firstAncestor)) { p.setScore(p.getScore() + (int)
|
---|
1027 | * (_places.get(_places.indexOf(firstAncestor)).getScore() *
|
---|
1028 | * _parentBonusPercentage)); } }
|
---|
1029 | *
|
---|
1030 | * if (secondAncestorList != null && secondAncestorList.size() > 0) {
|
---|
1031 | * secondAncestor = secondAncestorList.get(0);
|
---|
1032 | *
|
---|
1033 | * if (_places.contains(secondAncestor)) { p.setScore(p.getScore() + (int)
|
---|
1034 | * (_places.get(_places.indexOf(secondAncestor)).getScore() *
|
---|
1035 | * _parentBonusPercentage)); } } }
|
---|
1036 | */ |
---|