1 | package org.greenstone.server;
|
---|
2 |
|
---|
3 | import java.io.BufferedReader;
|
---|
4 | import java.io.StringReader;
|
---|
5 | import java.util.ArrayList;
|
---|
6 | import java.util.HashMap;
|
---|
7 |
|
---|
8 | import org.greenstone.client.Place;
|
---|
9 |
|
---|
10 | public class PageScanner
|
---|
11 | {
|
---|
12 | // Stores the gazetteer structure (used to verify whether words are a place name
|
---|
13 | // or not)
|
---|
14 | protected GazetteerTrieType2 _gazetteer = null;
|
---|
15 |
|
---|
16 | // Stores all of the places in the page that is being examined
|
---|
17 | protected ArrayList<Place> _places = new ArrayList<Place>();
|
---|
18 | protected HashMap<String, ArrayList<Place>> _placeNameMap = new HashMap<String, ArrayList<Place>>();
|
---|
19 | protected String _markedUpText = null;
|
---|
20 | protected HashMap<String, ArrayList<Place>> _placeCache = new HashMap<String, ArrayList<Place>>();
|
---|
21 |
|
---|
22 | protected GateScanner _gateScanner = new GateScanner();
|
---|
23 |
|
---|
24 | // Parameters for score calculations
|
---|
25 | // *********************************
|
---|
26 | protected double _penaltyPercentage = 0.5;
|
---|
27 | protected double _parentBonusPercentage = 0.25;
|
---|
28 | protected double _indirectReferencePenaltyPercentage = 0.25;
|
---|
29 | protected double _parentLimitPercentage = 0.05;
|
---|
30 |
|
---|
31 | protected ArrayList<String> _prevDoc = null;
|
---|
32 | protected String _prevFileName = null;
|
---|
33 |
|
---|
34 | /**
|
---|
35 | * Default constructer. It creates the place data structure and gazetteer
|
---|
36 | * trie structure
|
---|
37 | */
|
---|
38 | public PageScanner(String path)
|
---|
39 | {
|
---|
40 | System.out.println("Loading path = " + path);
|
---|
41 | try
|
---|
42 | {
|
---|
43 | PlaceInformation.init();
|
---|
44 | }
|
---|
45 | catch (Exception ex)
|
---|
46 | {
|
---|
47 | ex.printStackTrace();
|
---|
48 | }
|
---|
49 | System.out.println("Starting loading gazetteer");
|
---|
50 | _gazetteer = new GazetteerTrieType2(path + "/dataen.txt");
|
---|
51 | }
|
---|
52 |
|
---|
53 | /**
|
---|
54 | * Examines the given text to find place names and score them
|
---|
55 | *
|
---|
56 | * @param text
|
---|
57 | * is the text to examine
|
---|
58 | */
|
---|
59 | public ArrayList<Place> examineTextWithGate(String text, String htmlString)
|
---|
60 | {
|
---|
61 | System.out.println(htmlString);
|
---|
62 | // System.out.println("Examining text");
|
---|
63 |
|
---|
64 | StringBuilder html = null;
|
---|
65 | if (htmlString != null)
|
---|
66 | {
|
---|
67 | html = new StringBuilder(htmlString);
|
---|
68 | }
|
---|
69 |
|
---|
70 | _places.clear();
|
---|
71 | try
|
---|
72 | {
|
---|
73 | ArrayList<Word> classifiedWords = _gateScanner.classifyText(text);
|
---|
74 |
|
---|
75 | // The file to read from
|
---|
76 | BufferedReader file = new BufferedReader(new StringReader(text));
|
---|
77 |
|
---|
78 | // Stores the current words being examined
|
---|
79 | StringBuilder currentWords = new StringBuilder();
|
---|
80 |
|
---|
81 | // Stores the current line being examined
|
---|
82 | String currentLine = null;
|
---|
83 |
|
---|
84 | int htmlIndex = 0;
|
---|
85 | int textIndex = 0;
|
---|
86 | int cwindex = 0;
|
---|
87 |
|
---|
88 | // The list of words in the line
|
---|
89 | ArrayList<String> words = new ArrayList<String>();
|
---|
90 |
|
---|
91 | // Read each line from the file
|
---|
92 | while ((currentLine = file.readLine()) != null)
|
---|
93 | {
|
---|
94 | words.clear();
|
---|
95 |
|
---|
96 | // Find the words in those lines and add them to the list
|
---|
97 | words.addAll(MarkupService.findWords(currentLine));
|
---|
98 |
|
---|
99 | // Examine the words
|
---|
100 | for (int j = 0; j < words.size(); j++)
|
---|
101 | {
|
---|
102 | int oldHTMLIndex = 0;
|
---|
103 | if (htmlString != null)
|
---|
104 | {
|
---|
105 | oldHTMLIndex = htmlIndex;
|
---|
106 | htmlIndex = html.indexOf(words.get(j), htmlIndex);
|
---|
107 | }
|
---|
108 | textIndex = text.indexOf(words.get(j), textIndex);
|
---|
109 | Word currentWord = null;
|
---|
110 | int cwindexBefore = cwindex;
|
---|
111 | for (; cwindex < classifiedWords.size(); cwindex++)
|
---|
112 | {
|
---|
113 | if (textIndex > 0 && text.charAt(textIndex - 1) == '\'')
|
---|
114 | {
|
---|
115 | break;
|
---|
116 | }
|
---|
117 |
|
---|
118 | //System.out.println("Looking for \"" + words.get(j) + "\" got \"" + classifiedWords.get(cwindex) + "\"");
|
---|
119 | if (classifiedWords.get(cwindex).getValue().equals(words.get(j)))
|
---|
120 | {
|
---|
121 | currentWord = classifiedWords.get(cwindex);
|
---|
122 | break;
|
---|
123 | }
|
---|
124 | }
|
---|
125 |
|
---|
126 | if (currentWord == null)
|
---|
127 | {
|
---|
128 | cwindex = cwindexBefore;
|
---|
129 | continue;
|
---|
130 | }
|
---|
131 |
|
---|
132 | // If the word does not begin with an uppercase letter then ignore it
|
---|
133 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))) || !(currentWord.getClassification().equals("NNP")))
|
---|
134 | {
|
---|
135 | continue;
|
---|
136 | }
|
---|
137 |
|
---|
138 | // Used to store a good place name
|
---|
139 | String lastGoodPlaceName = null;
|
---|
140 |
|
---|
141 | // Add the first word to the list of words to be examined
|
---|
142 | currentWords.append(words.get(j));
|
---|
143 |
|
---|
144 | // While the gazetteer does not reach a dead end, add
|
---|
145 | // another word and examine them
|
---|
146 | int count = 1;
|
---|
147 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
148 | {
|
---|
149 | // If the words are a place name then store it
|
---|
150 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
151 | {
|
---|
152 | lastGoodPlaceName = currentWords.toString();
|
---|
153 | }
|
---|
154 |
|
---|
155 | // If it is not the end of the words the add the next
|
---|
156 | // word
|
---|
157 | if (j + count < words.size())
|
---|
158 | {
|
---|
159 | currentWords.append(" " + words.get(j + count++));
|
---|
160 | }
|
---|
161 | else
|
---|
162 | {
|
---|
163 | break;
|
---|
164 | }
|
---|
165 | }
|
---|
166 |
|
---|
167 | // If there was a place name found then find its information
|
---|
168 | // and score it
|
---|
169 | if (lastGoodPlaceName != null)
|
---|
170 | {
|
---|
171 | if (htmlString != null)
|
---|
172 | {
|
---|
173 | if (htmlIndex == -1)
|
---|
174 | {
|
---|
175 | htmlIndex = oldHTMLIndex;
|
---|
176 | }
|
---|
177 | else
|
---|
178 | {
|
---|
179 | html.insert(htmlIndex, "<span class=\"place\">");
|
---|
180 | html.insert(htmlIndex + "<span class=\"place\">".length() + lastGoodPlaceName.length(), "</span>");
|
---|
181 | }
|
---|
182 | }
|
---|
183 |
|
---|
184 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
185 |
|
---|
186 | if (placeList == null)
|
---|
187 | {
|
---|
188 | continue;
|
---|
189 | }
|
---|
190 |
|
---|
191 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
192 | {
|
---|
193 | for(Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
194 | {
|
---|
195 | p.directReference();
|
---|
196 | }
|
---|
197 | }
|
---|
198 |
|
---|
199 | for (Place p : placeList)
|
---|
200 | {
|
---|
201 | p.directReference();
|
---|
202 | addScore(p, 256);
|
---|
203 | }
|
---|
204 | }
|
---|
205 |
|
---|
206 | currentWords = new StringBuilder();
|
---|
207 | }
|
---|
208 | }
|
---|
209 | file.close();
|
---|
210 | }
|
---|
211 | catch (Exception ex)
|
---|
212 | {
|
---|
213 | ex.printStackTrace();
|
---|
214 | }
|
---|
215 |
|
---|
216 | if (htmlString != null)
|
---|
217 | {
|
---|
218 | _markedUpText = html.toString();
|
---|
219 | }
|
---|
220 |
|
---|
221 | if(_places.size() > 0)
|
---|
222 | {
|
---|
223 | adjustScores();
|
---|
224 | }
|
---|
225 |
|
---|
226 | return _places;
|
---|
227 | }
|
---|
228 |
|
---|
229 | public ArrayList<Place> examineTextWithoutGate(ArrayList<ArrayList<String>> lines)
|
---|
230 | {
|
---|
231 | _places.clear();
|
---|
232 | try
|
---|
233 | {
|
---|
234 | // Stores the current words being examined
|
---|
235 | StringBuilder currentWords = new StringBuilder();
|
---|
236 |
|
---|
237 | // Read each line from the file
|
---|
238 | for(ArrayList<String> words : lines)
|
---|
239 | {
|
---|
240 | // Examine the words
|
---|
241 | for (int j = 0; j < words.size(); j++)
|
---|
242 | {
|
---|
243 | // If the word does not begin with an uppercase letter then
|
---|
244 | // ignore it
|
---|
245 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
246 | {
|
---|
247 | continue;
|
---|
248 | }
|
---|
249 |
|
---|
250 | // Used to store a good place name
|
---|
251 | String lastGoodPlaceName = null;
|
---|
252 |
|
---|
253 | // Add the first word to the list of words to be examined
|
---|
254 | currentWords.append(words.get(j));
|
---|
255 |
|
---|
256 | // While the gazetteer does not reach a dead end, add
|
---|
257 | // another word and examine them
|
---|
258 | int count = 1;
|
---|
259 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
260 | {
|
---|
261 | // If the words are a place name then store it
|
---|
262 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
263 | {
|
---|
264 | lastGoodPlaceName = currentWords.toString();
|
---|
265 | // System.out.println("Current place name part => "
|
---|
266 | // + lastGoodPlaceName);
|
---|
267 | }
|
---|
268 |
|
---|
269 | // If it is not the end of the words the add the next
|
---|
270 | // word
|
---|
271 | if (j + count < words.size())
|
---|
272 | {
|
---|
273 | currentWords.append(" " + words.get(j + count++));
|
---|
274 | }
|
---|
275 | else
|
---|
276 | {
|
---|
277 | break;
|
---|
278 | }
|
---|
279 | }
|
---|
280 |
|
---|
281 | // If there was a place name found then find its information
|
---|
282 | // and score it
|
---|
283 | if (lastGoodPlaceName != null)
|
---|
284 | {
|
---|
285 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
286 |
|
---|
287 | if (placeList == null)
|
---|
288 | {
|
---|
289 | continue;
|
---|
290 | }
|
---|
291 |
|
---|
292 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
293 | {
|
---|
294 | for(Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
295 | {
|
---|
296 | p.directReference();
|
---|
297 | }
|
---|
298 | }
|
---|
299 |
|
---|
300 | for (Place p : placeList)
|
---|
301 | {
|
---|
302 | p.directReference();
|
---|
303 | addScore(p, 256);
|
---|
304 | }
|
---|
305 | }
|
---|
306 | currentWords = new StringBuilder();
|
---|
307 | }
|
---|
308 | }
|
---|
309 | }
|
---|
310 | catch (Exception ex)
|
---|
311 | {
|
---|
312 | ex.printStackTrace();
|
---|
313 | }
|
---|
314 |
|
---|
315 | adjustScores();
|
---|
316 |
|
---|
317 | return _places;
|
---|
318 | }
|
---|
319 |
|
---|
320 | public ArrayList<String> getPlaceNames(String text)
|
---|
321 | {
|
---|
322 | ArrayList<String> placeNames = new ArrayList<String>();
|
---|
323 | _places.clear();
|
---|
324 | try
|
---|
325 | {
|
---|
326 | // The file to read from
|
---|
327 | BufferedReader file = new BufferedReader(new StringReader(text));
|
---|
328 |
|
---|
329 | // Stores the current words being examined
|
---|
330 | StringBuilder currentWords = new StringBuilder();
|
---|
331 |
|
---|
332 | // Stores the current line being examined
|
---|
333 | String currentLine = null;
|
---|
334 |
|
---|
335 | // The list of words in the line
|
---|
336 | ArrayList<String> words = new ArrayList<String>();
|
---|
337 |
|
---|
338 | // Read each line from the file
|
---|
339 | while ((currentLine = file.readLine()) != null)
|
---|
340 | {
|
---|
341 | words.clear();
|
---|
342 |
|
---|
343 | // Find the words in those lines and add them to the list
|
---|
344 | words.addAll(MarkupService.findWords(currentLine));
|
---|
345 |
|
---|
346 | // Examine the words
|
---|
347 | for (int j = 0; j < words.size(); j++)
|
---|
348 | {
|
---|
349 | // If the word does not begin with an uppercase letter then
|
---|
350 | // ignore it
|
---|
351 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
352 | {
|
---|
353 | continue;
|
---|
354 | }
|
---|
355 |
|
---|
356 | // Used to store a good place name
|
---|
357 | String lastGoodPlaceName = null;
|
---|
358 |
|
---|
359 | // Add the first word to the list of words to be examined
|
---|
360 | currentWords.append(words.get(j));
|
---|
361 |
|
---|
362 | // While the gazetteer does not reach a dead end, add
|
---|
363 | // another word and examine them
|
---|
364 | int count = 1;
|
---|
365 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
366 | {
|
---|
367 | // If the words are a place name then store it
|
---|
368 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
369 | {
|
---|
370 | lastGoodPlaceName = currentWords.toString();
|
---|
371 | // System.out.println("Current place name part => "
|
---|
372 | // + lastGoodPlaceName);
|
---|
373 | }
|
---|
374 |
|
---|
375 | // If it is not the end of the words the add the next
|
---|
376 | // word
|
---|
377 | if (j + count < words.size())
|
---|
378 | {
|
---|
379 | currentWords.append(" " + words.get(j + count++));
|
---|
380 | }
|
---|
381 | else
|
---|
382 | {
|
---|
383 | break;
|
---|
384 | }
|
---|
385 | }
|
---|
386 |
|
---|
387 | // If there was a place name found then find its information
|
---|
388 | // and score it
|
---|
389 | if (lastGoodPlaceName != null)
|
---|
390 | {
|
---|
391 | placeNames.add(lastGoodPlaceName);
|
---|
392 | }
|
---|
393 | currentWords = new StringBuilder();
|
---|
394 | }
|
---|
395 | }
|
---|
396 | file.close();
|
---|
397 | }
|
---|
398 | catch (Exception ex)
|
---|
399 | {
|
---|
400 | ex.printStackTrace();
|
---|
401 | }
|
---|
402 |
|
---|
403 | return placeNames;
|
---|
404 | }
|
---|
405 |
|
---|
406 | public ArrayList<Place> examineArrayOfStrings(ArrayList<String> placeNames)
|
---|
407 | {
|
---|
408 | _places.clear();
|
---|
409 | for(int i = 0; i < placeNames.size(); i++)
|
---|
410 | {
|
---|
411 | String currentPlaceName = placeNames.get(i);
|
---|
412 |
|
---|
413 | ArrayList<Place> placeList = null;
|
---|
414 | if(_placeCache.containsKey(currentPlaceName))
|
---|
415 | {
|
---|
416 | placeList = _placeCache.get(currentPlaceName);
|
---|
417 | for(Place p : placeList)
|
---|
418 | {
|
---|
419 | p.unDirectReference();
|
---|
420 | }
|
---|
421 | }
|
---|
422 | else
|
---|
423 | {
|
---|
424 | placeList = PlaceInformation.getPlaces(currentPlaceName);
|
---|
425 | _placeCache.put(currentPlaceName, placeList);
|
---|
426 | }
|
---|
427 |
|
---|
428 | if (placeList == null)
|
---|
429 | {
|
---|
430 | continue;
|
---|
431 | }
|
---|
432 |
|
---|
433 | if (_placeNameMap.containsKey(currentPlaceName))
|
---|
434 | {
|
---|
435 | for(Place p : _placeNameMap.get(currentPlaceName))
|
---|
436 | {
|
---|
437 | p.directReference();
|
---|
438 | }
|
---|
439 | }
|
---|
440 |
|
---|
441 | for (Place p : placeList)
|
---|
442 | {
|
---|
443 | p.directReference();
|
---|
444 | addScore(p, 256);
|
---|
445 | }
|
---|
446 | }
|
---|
447 |
|
---|
448 | // System.out.println("Done!");
|
---|
449 | // System.out.print("Adjusting Scores... ");
|
---|
450 |
|
---|
451 | adjustScores();
|
---|
452 |
|
---|
453 | // System.out.println("Done!");
|
---|
454 | // System.out.print("Sorting Scores... ");
|
---|
455 |
|
---|
456 | // System.out.println("Done!");
|
---|
457 |
|
---|
458 | // System.out.println("Places found = " + _places);
|
---|
459 |
|
---|
460 | return _places;
|
---|
461 | }
|
---|
462 |
|
---|
463 | public ArrayList<Place> examineTextWithoutGate(String text)
|
---|
464 | {
|
---|
465 | _places.clear();
|
---|
466 | try
|
---|
467 | {
|
---|
468 | // The file to read from
|
---|
469 | BufferedReader file = new BufferedReader(new StringReader(text));
|
---|
470 |
|
---|
471 | // Stores the current words being examined
|
---|
472 | StringBuilder currentWords = new StringBuilder();
|
---|
473 |
|
---|
474 | // Stores the current line being examined
|
---|
475 | String currentLine = null;
|
---|
476 |
|
---|
477 | // The list of words in the line
|
---|
478 | ArrayList<String> words = new ArrayList<String>();
|
---|
479 |
|
---|
480 | // Read each line from the file
|
---|
481 | while ((currentLine = file.readLine()) != null)
|
---|
482 | {
|
---|
483 | words.clear();
|
---|
484 |
|
---|
485 | // Find the words in those lines and add them to the list
|
---|
486 | words.addAll(MarkupService.findWords(currentLine));
|
---|
487 |
|
---|
488 | // Examine the words
|
---|
489 | for (int j = 0; j < words.size(); j++)
|
---|
490 | {
|
---|
491 | // If the word does not begin with an uppercase letter then
|
---|
492 | // ignore it
|
---|
493 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
494 | {
|
---|
495 | continue;
|
---|
496 | }
|
---|
497 |
|
---|
498 | // Used to store a good place name
|
---|
499 | String lastGoodPlaceName = null;
|
---|
500 |
|
---|
501 | // Add the first word to the list of words to be examined
|
---|
502 | currentWords.append(words.get(j));
|
---|
503 |
|
---|
504 | // While the gazetteer does not reach a dead end, add
|
---|
505 | // another word and examine them
|
---|
506 | int count = 1;
|
---|
507 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
508 | {
|
---|
509 | // If the words are a place name then store it
|
---|
510 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
511 | {
|
---|
512 | lastGoodPlaceName = currentWords.toString();
|
---|
513 | // System.out.println("Current place name part => "
|
---|
514 | // + lastGoodPlaceName);
|
---|
515 | }
|
---|
516 |
|
---|
517 | // If it is not the end of the words the add the next
|
---|
518 | // word
|
---|
519 | if (j + count < words.size())
|
---|
520 | {
|
---|
521 | currentWords.append(" " + words.get(j + count++));
|
---|
522 | }
|
---|
523 | else
|
---|
524 | {
|
---|
525 | break;
|
---|
526 | }
|
---|
527 | }
|
---|
528 |
|
---|
529 | // If there was a place name found then find its information
|
---|
530 | // and score it
|
---|
531 | if (lastGoodPlaceName != null)
|
---|
532 | {
|
---|
533 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
534 |
|
---|
535 | if (placeList == null)
|
---|
536 | {
|
---|
537 | continue;
|
---|
538 | }
|
---|
539 |
|
---|
540 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
541 | {
|
---|
542 | for(Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
543 | {
|
---|
544 | p.directReference();
|
---|
545 | }
|
---|
546 | }
|
---|
547 |
|
---|
548 | for (Place p : placeList)
|
---|
549 | {
|
---|
550 | p.directReference();
|
---|
551 | addScore(p, 256);
|
---|
552 | }
|
---|
553 | }
|
---|
554 | currentWords = new StringBuilder();
|
---|
555 | }
|
---|
556 | }
|
---|
557 | file.close();
|
---|
558 | }
|
---|
559 | catch (Exception ex)
|
---|
560 | {
|
---|
561 | ex.printStackTrace();
|
---|
562 | }
|
---|
563 |
|
---|
564 | // System.out.println("Done!");
|
---|
565 | // System.out.print("Adjusting Scores... ");
|
---|
566 |
|
---|
567 | adjustScores();
|
---|
568 |
|
---|
569 | // System.out.println("Done!");
|
---|
570 | // System.out.print("Sorting Scores... ");
|
---|
571 | // System.out.println("Done!");
|
---|
572 | // System.out.println("Places found = " + _places);
|
---|
573 |
|
---|
574 | return _places;
|
---|
575 | }
|
---|
576 |
|
---|
577 | public String getMarkedUpText()
|
---|
578 | {
|
---|
579 | return _markedUpText;
|
---|
580 | }
|
---|
581 |
|
---|
582 | public Place getTopScorePlace()
|
---|
583 | {
|
---|
584 | Place p = _places.get(0);
|
---|
585 |
|
---|
586 | for(Place pp : _places)
|
---|
587 | {
|
---|
588 | if(pp.getScore() > p.getScore())
|
---|
589 | {
|
---|
590 | p = pp;
|
---|
591 | }
|
---|
592 | }
|
---|
593 | return p;
|
---|
594 | }
|
---|
595 |
|
---|
596 | /**
|
---|
597 | * Used to make the original place scores more accurate by using other place
|
---|
598 | * information
|
---|
599 | */
|
---|
600 | public void adjustScores()
|
---|
601 | {
|
---|
602 | for(Place p : _places)
|
---|
603 | {
|
---|
604 | if(!p.isDirectlyReferenced())
|
---|
605 | {
|
---|
606 | p.setScore((int)(p.getScore() * _penaltyPercentage));
|
---|
607 | }
|
---|
608 | }
|
---|
609 |
|
---|
610 | Place topScore = getTopScorePlace();
|
---|
611 |
|
---|
612 | for(Place p : _places)
|
---|
613 | {
|
---|
614 | if (p.getParentPlaceName() == null)
|
---|
615 | {
|
---|
616 | continue;
|
---|
617 | }
|
---|
618 |
|
---|
619 | for (Place pp : _places)
|
---|
620 | {
|
---|
621 | if(p.isIn(pp) && (topScore.getScore() - pp.getScore()) <= (int)(topScore.getScore() * 0.1 * _parentLimitPercentage))
|
---|
622 | {
|
---|
623 | p.setScore((int)(p.getScore() + (pp.getScore() * _parentBonusPercentage)));
|
---|
624 | }
|
---|
625 | }
|
---|
626 | }
|
---|
627 | }
|
---|
628 |
|
---|
629 | /**
|
---|
630 | * Adds one to the score of the given place
|
---|
631 | *
|
---|
632 | * @param p
|
---|
633 | * is the place to add one to the score of
|
---|
634 | */
|
---|
635 | public void addScore(Place p, Integer scoreToAdd)
|
---|
636 | {
|
---|
637 | if (p == null)
|
---|
638 | {
|
---|
639 | return;
|
---|
640 | }
|
---|
641 |
|
---|
642 | // If there is already a score for this key the increase it by one
|
---|
643 | if (_places.contains(p))
|
---|
644 | {
|
---|
645 | Place place = _places.get(_places.indexOf(p));
|
---|
646 | place.setScore(place.getScore() + scoreToAdd);
|
---|
647 | }
|
---|
648 | // If there is no score for this key then make one
|
---|
649 | else
|
---|
650 | {
|
---|
651 | p.setScore(scoreToAdd);
|
---|
652 | _places.add(p);
|
---|
653 |
|
---|
654 | if(_placeNameMap.containsKey(p.getName()))
|
---|
655 | {
|
---|
656 | _placeNameMap.get(p.getName()).add(p);
|
---|
657 | }
|
---|
658 | else
|
---|
659 | {
|
---|
660 | ArrayList<Place> placeList = new ArrayList<Place>();
|
---|
661 | _placeNameMap.put(p.getName(), placeList);
|
---|
662 | }
|
---|
663 | }
|
---|
664 |
|
---|
665 | // Add to the parent's score too (if there is one)
|
---|
666 | if (!(p.getParentPlaceName() == null))
|
---|
667 | {
|
---|
668 | // If there is an ancestor then add to it's score as well
|
---|
669 | if (p.getParentPlaceName().contains(", "))
|
---|
670 | {
|
---|
671 | String[] places = p.getParentPlaceName().split(", ");
|
---|
672 |
|
---|
673 | ArrayList<Place> specificPlaces = null;
|
---|
674 |
|
---|
675 | if (places.length == 3)
|
---|
676 | {
|
---|
677 | specificPlaces = PlaceInformation.getSpecificPlace(places[0], places[1] + ", " + places[2]);
|
---|
678 | }
|
---|
679 | else
|
---|
680 | {
|
---|
681 | specificPlaces = PlaceInformation.getSpecificPlace(places[0], places[1]);
|
---|
682 | }
|
---|
683 |
|
---|
684 | if (specificPlaces == null)
|
---|
685 | {
|
---|
686 | return;
|
---|
687 | }
|
---|
688 |
|
---|
689 | for (Place pp : specificPlaces)
|
---|
690 | {
|
---|
691 | addScore(pp, (int) (scoreToAdd * _indirectReferencePenaltyPercentage));
|
---|
692 | }
|
---|
693 | }
|
---|
694 | // Otherwise just add the parent
|
---|
695 | else
|
---|
696 | {
|
---|
697 | ArrayList<Place> specificPlaces = PlaceInformation.getSpecificPlace(p.getParentPlaceName(), null);
|
---|
698 |
|
---|
699 | if (specificPlaces == null)
|
---|
700 | {
|
---|
701 | return;
|
---|
702 | }
|
---|
703 |
|
---|
704 | for (Place pp : specificPlaces)
|
---|
705 | {
|
---|
706 | addScore(pp, (int) (scoreToAdd * _indirectReferencePenaltyPercentage));
|
---|
707 | }
|
---|
708 | }
|
---|
709 | }
|
---|
710 | }
|
---|
711 |
|
---|
712 |
|
---|
713 | public void sortScores()
|
---|
714 | {
|
---|
715 | ArrayList<Place> sortedPlaces = new ArrayList<Place>();
|
---|
716 |
|
---|
717 | while (_places.size() > 0)
|
---|
718 | {
|
---|
719 | int index = -1;
|
---|
720 | for (int j = 0; j < _places.size(); j++)
|
---|
721 | {
|
---|
722 | if (index == -1 || _places.get(j).getScore() > _places.get(index).getScore())
|
---|
723 | {
|
---|
724 | index = j;
|
---|
725 | }
|
---|
726 | }
|
---|
727 |
|
---|
728 | sortedPlaces.add(_places.remove(index));
|
---|
729 | }
|
---|
730 |
|
---|
731 | _places = sortedPlaces;
|
---|
732 | }
|
---|
733 |
|
---|
734 | public ArrayList<Place> getPlacesWithParams(double maxScorePercentage, double minScorePercentage, long minPopulation, boolean locality, boolean region, boolean country, int numberOfPlacesToGet)
|
---|
735 | {
|
---|
736 | // System.out.println("Getting places with params");
|
---|
737 |
|
---|
738 | int topScore = _places.get(0).getScore();
|
---|
739 | int minScore = (int) (topScore * minScorePercentage);
|
---|
740 | int maxScore = (int) (topScore * maxScorePercentage);
|
---|
741 |
|
---|
742 | // System.out.println("minScore = " + minScore);
|
---|
743 | // System.out.println("maxScore = " + maxScore);
|
---|
744 |
|
---|
745 | ArrayList<Place> matchingPlaces = new ArrayList<Place>();
|
---|
746 |
|
---|
747 | // Go through all the markers
|
---|
748 | for (Place p : _places)
|
---|
749 | {
|
---|
750 | // System.out.println("Testing place -> " + p.getName());
|
---|
751 | if (p == null)
|
---|
752 | {
|
---|
753 | // System.out.println("P is null?");
|
---|
754 | continue;
|
---|
755 | }
|
---|
756 |
|
---|
757 | // If the place meets the criteria
|
---|
758 | if (p.getScore() > minScore && p.getScore() <= maxScore && p.getPopulation() > minPopulation && ((locality && p.getPlaceType().equals("locality")) || (region && p.getPlaceType().equals("region")) || (country && p.getPlaceType().equals("country"))))
|
---|
759 | {
|
---|
760 | // System.out.println("MATCH!");
|
---|
761 | // If there is not already the maximum amount of visible places
|
---|
762 | // then add this
|
---|
763 | if (matchingPlaces.size() < numberOfPlacesToGet)
|
---|
764 | {
|
---|
765 | matchingPlaces.add(p);
|
---|
766 | }
|
---|
767 | // If there is already MAXVISIBLE visible places then see if
|
---|
768 | // this place should replace one
|
---|
769 | else
|
---|
770 | {
|
---|
771 | Place minScorePlace = null;
|
---|
772 | for (Place q : matchingPlaces)
|
---|
773 | {
|
---|
774 | if (minScorePlace == null)
|
---|
775 | {
|
---|
776 | minScorePlace = q;
|
---|
777 | }
|
---|
778 | if (q.getScore() < minScorePlace.getScore())
|
---|
779 | {
|
---|
780 | minScorePlace = q;
|
---|
781 | }
|
---|
782 | }
|
---|
783 |
|
---|
784 | if (p.getScore() > minScorePlace.getScore())
|
---|
785 | {
|
---|
786 | matchingPlaces.remove(minScorePlace);
|
---|
787 | matchingPlaces.add(p);
|
---|
788 | }
|
---|
789 | }
|
---|
790 | }
|
---|
791 | else
|
---|
792 | {
|
---|
793 | // System.out.println("NOT A MATCH");
|
---|
794 | }
|
---|
795 | }
|
---|
796 | return matchingPlaces;
|
---|
797 | }
|
---|
798 |
|
---|
799 | public void setScoringParams(double penaltyPercentage, double parentBonusPercentage, double indirectReferencePenaltyPercentage)
|
---|
800 | {
|
---|
801 | _penaltyPercentage = penaltyPercentage;
|
---|
802 | _parentBonusPercentage = parentBonusPercentage;
|
---|
803 | _indirectReferencePenaltyPercentage = indirectReferencePenaltyPercentage;
|
---|
804 | }
|
---|
805 |
|
---|
806 | public void setScoringParams(ScanConfiguration config)
|
---|
807 | {
|
---|
808 | _penaltyPercentage = config.getPenalty();
|
---|
809 | _parentBonusPercentage = config.getParentBonus();
|
---|
810 | _indirectReferencePenaltyPercentage = config.getIndirectReferencePenalty();
|
---|
811 | _parentLimitPercentage = config.getParentLimit();
|
---|
812 | }
|
---|
813 |
|
---|
814 | // public HashMap<String, Integer> wordCount(String fileName)
|
---|
815 | // {
|
---|
816 | // HashMap<String, Integer> wordCountMap = new HashMap<String, Integer>();
|
---|
817 | //
|
---|
818 | // try
|
---|
819 | // {
|
---|
820 | // BufferedReader file = new BufferedReader(new FileReader(fileName));
|
---|
821 | //
|
---|
822 | // StringBuilder currentWord = new StringBuilder();
|
---|
823 | //
|
---|
824 | // String line = "";
|
---|
825 | //
|
---|
826 | // ArrayList<String> words = new ArrayList<String>();
|
---|
827 | //
|
---|
828 | // //System.out.print("Finding words... ");
|
---|
829 | // while((line = file.readLine()) != null)
|
---|
830 | // {
|
---|
831 | // words.addAll(MarkupService.findWords(line));
|
---|
832 | // }
|
---|
833 | // //System.out.println("Done!");
|
---|
834 | //
|
---|
835 | // //System.out.print("Adding up scores... ");
|
---|
836 | // for(int j = 0; j < words.size(); j++)
|
---|
837 | // {
|
---|
838 | // if(Character.isLowerCase(words.get(j).charAt(0)))
|
---|
839 | // {
|
---|
840 | // //continue;
|
---|
841 | // }
|
---|
842 | //
|
---|
843 | // currentWord.append(words.get(j));
|
---|
844 | //
|
---|
845 | // int count = 1;
|
---|
846 | // while(_gazetteer.checkPlaceName(currentWord.toString()) != -1)
|
---|
847 | // {
|
---|
848 | // if(_gazetteer.checkPlaceName(currentWord.toString()) == 1)
|
---|
849 | // {
|
---|
850 | // if(wordCountMap.containsKey(currentWord.toString()))
|
---|
851 | // {
|
---|
852 | // Integer i = wordCountMap.get(currentWord.toString());
|
---|
853 | // wordCountMap.put(currentWord.toString(), ++i);
|
---|
854 | // }
|
---|
855 | // else
|
---|
856 | // {
|
---|
857 | // wordCountMap.put(currentWord.toString(), 1);
|
---|
858 | // }
|
---|
859 | // }
|
---|
860 | // currentWord.append(" " + words.get(j + count++));
|
---|
861 | // }
|
---|
862 | //
|
---|
863 | // currentWord.delete(0, currentWord.length());
|
---|
864 | // }
|
---|
865 | // }
|
---|
866 | // catch(Exception ex)
|
---|
867 | // {
|
---|
868 | // ex.printStackTrace();
|
---|
869 | // }
|
---|
870 | //
|
---|
871 | // return wordCountMap;
|
---|
872 | // }
|
---|
873 |
|
---|
874 | public boolean isGazetteerLoaded()
|
---|
875 | {
|
---|
876 | return _gazetteer != null;
|
---|
877 | }
|
---|
878 |
|
---|
879 | public ArrayList<ArrayList<Place>> examineMultipleTexts(ArrayList<String> texts)
|
---|
880 | {
|
---|
881 | ArrayList<ArrayList<Place>> multipleResults = new ArrayList<ArrayList<Place>>();
|
---|
882 | for (String text : texts)
|
---|
883 | {
|
---|
884 | if (text != null)
|
---|
885 | {
|
---|
886 | this.examineTextWithGate(text, null);
|
---|
887 |
|
---|
888 | if(_places.size() > 0)
|
---|
889 | {
|
---|
890 | multipleResults.add(new ArrayList<Place>(_places));
|
---|
891 | }
|
---|
892 | else
|
---|
893 | {
|
---|
894 | multipleResults.add(null);
|
---|
895 | }
|
---|
896 | }
|
---|
897 | else
|
---|
898 | {
|
---|
899 | multipleResults.add(null);
|
---|
900 | }
|
---|
901 | }
|
---|
902 |
|
---|
903 | return multipleResults;
|
---|
904 | }
|
---|
905 |
|
---|
906 | public ArrayList<Place> getPlaces()
|
---|
907 | {
|
---|
908 | return _places;
|
---|
909 | }
|
---|
910 |
|
---|
911 | public void clearPlaces()
|
---|
912 | {
|
---|
913 | _places.clear();
|
---|
914 | }
|
---|
915 | }
|
---|
916 |
|
---|
917 | /*
|
---|
918 | *
|
---|
919 | * // Halve the scores for places that do not have their parents
|
---|
920 | // mentioned
|
---|
921 | // ********************************************************************
|
---|
922 |
|
---|
923 | String[] parentPlaceNames = p.getParentPlaceName().split(", ");
|
---|
924 | if (parentPlaceNames.length == 1)
|
---|
925 | {
|
---|
926 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], null);
|
---|
927 | Place parent = null;
|
---|
928 |
|
---|
929 | if (parentList != null && parentList.size() > 0)
|
---|
930 | {
|
---|
931 | parent = parentList.get(0);
|
---|
932 |
|
---|
933 | if (!_places.contains(parent) || !_places.get(_places.indexOf(parent)).isDirectlyReferenced())
|
---|
934 | {
|
---|
935 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
936 | }
|
---|
937 | }
|
---|
938 | }
|
---|
939 | else if (parentPlaceNames.length == 2)
|
---|
940 | {
|
---|
941 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1]);
|
---|
942 | ArrayList<Place> ancestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[1], null);
|
---|
943 |
|
---|
944 | Place parent = null;
|
---|
945 | Place ancestor = null;
|
---|
946 |
|
---|
947 | if (parentList != null && parentList.size() > 0)
|
---|
948 | {
|
---|
949 | parent = parentList.get(0);
|
---|
950 |
|
---|
951 | if (!_places.contains(parent) || !_places.get(_places.indexOf(parent)).isDirectlyReferenced())
|
---|
952 | {
|
---|
953 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
954 | }
|
---|
955 | }
|
---|
956 |
|
---|
957 | if (ancestorList != null && ancestorList.size() > 0)
|
---|
958 | {
|
---|
959 | ancestor = ancestorList.get(0);
|
---|
960 |
|
---|
961 | if (!_places.contains(ancestor) || !_places.get(_places.indexOf(ancestor)).isDirectlyReferenced())
|
---|
962 | {
|
---|
963 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
964 | }
|
---|
965 | }
|
---|
966 | }
|
---|
967 | else if (parentPlaceNames.length == 3)
|
---|
968 | {
|
---|
969 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1] + ", " + parentPlaceNames[2]);
|
---|
970 | ArrayList<Place> firstAncestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[1], parentPlaceNames[2]);
|
---|
971 | ArrayList<Place> secondAncestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[2], null);
|
---|
972 |
|
---|
973 | Place parent = null;
|
---|
974 | Place firstAncestor = null;
|
---|
975 | Place secondAncestor = null;
|
---|
976 |
|
---|
977 | if (parentList != null && parentList.size() > 0)
|
---|
978 | {
|
---|
979 | parent = parentList.get(0);
|
---|
980 |
|
---|
981 | if (!_places.contains(parent) || !_places.get(_places.indexOf(parent)).isDirectlyReferenced())
|
---|
982 | {
|
---|
983 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
984 | }
|
---|
985 | }
|
---|
986 |
|
---|
987 | if (firstAncestorList != null && firstAncestorList.size() > 0)
|
---|
988 | {
|
---|
989 | firstAncestor = firstAncestorList.get(0);
|
---|
990 |
|
---|
991 | if (!_places.contains(firstAncestor) || !_places.get(_places.indexOf(firstAncestor)).isDirectlyReferenced())
|
---|
992 | {
|
---|
993 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
994 | }
|
---|
995 | }
|
---|
996 |
|
---|
997 | if (secondAncestorList != null && secondAncestorList.size() > 0)
|
---|
998 | {
|
---|
999 | secondAncestor = secondAncestorList.get(0);
|
---|
1000 |
|
---|
1001 | if (!_places.contains(secondAncestor) || !_places.get(_places.indexOf(secondAncestor)).isDirectlyReferenced())
|
---|
1002 | {
|
---|
1003 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
1004 | }
|
---|
1005 | }
|
---|
1006 | }
|
---|
1007 | */
|
---|
1008 | // Add part of the parent's score to the child
|
---|
1009 | // *******************************************
|
---|
1010 | /*
|
---|
1011 | if (parentPlaceNames.length == 1)
|
---|
1012 | {
|
---|
1013 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], null);
|
---|
1014 |
|
---|
1015 | Place parent = null;
|
---|
1016 |
|
---|
1017 | if (parentList != null && parentList.size() > 0)
|
---|
1018 | {
|
---|
1019 | parent = parentList.get(0);
|
---|
1020 |
|
---|
1021 | if (_places.contains(parent))
|
---|
1022 | {
|
---|
1023 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
1024 | }
|
---|
1025 | }
|
---|
1026 | }
|
---|
1027 | else if (parentPlaceNames.length == 2)
|
---|
1028 | {
|
---|
1029 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1]);
|
---|
1030 | ArrayList<Place> ancestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[1], null);
|
---|
1031 |
|
---|
1032 | Place parent = null;
|
---|
1033 | Place ancestor = null;
|
---|
1034 |
|
---|
1035 | if (parentList != null && parentList.size() > 0)
|
---|
1036 | {
|
---|
1037 | parent = parentList.get(0);
|
---|
1038 |
|
---|
1039 | if (_places.contains(parent))
|
---|
1040 | {
|
---|
1041 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
1042 | }
|
---|
1043 | }
|
---|
1044 |
|
---|
1045 | if (ancestorList != null && ancestorList.size() > 0)
|
---|
1046 | {
|
---|
1047 | ancestor = ancestorList.get(0);
|
---|
1048 |
|
---|
1049 | if (_places.contains(ancestor))
|
---|
1050 | {
|
---|
1051 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(ancestor)).getScore() * _parentBonusPercentage));
|
---|
1052 | }
|
---|
1053 | }
|
---|
1054 | }
|
---|
1055 | else if (parentPlaceNames.length == 3)
|
---|
1056 | {
|
---|
1057 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1] + ", " + parentPlaceNames[2]);
|
---|
1058 | ArrayList<Place> firstAncestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[1], parentPlaceNames[2]);
|
---|
1059 | ArrayList<Place> secondAncestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[2], null);
|
---|
1060 |
|
---|
1061 | Place parent = null;
|
---|
1062 | Place firstAncestor = null;
|
---|
1063 | Place secondAncestor = null;
|
---|
1064 |
|
---|
1065 | if (parentList != null && parentList.size() > 0)
|
---|
1066 | {
|
---|
1067 | parent = parentList.get(0);
|
---|
1068 |
|
---|
1069 | if (_places.contains(parent))
|
---|
1070 | {
|
---|
1071 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
1072 | }
|
---|
1073 | }
|
---|
1074 |
|
---|
1075 | if (firstAncestorList != null && firstAncestorList.size() > 0)
|
---|
1076 | {
|
---|
1077 | firstAncestor = firstAncestorList.get(0);
|
---|
1078 |
|
---|
1079 | if (_places.contains(firstAncestor))
|
---|
1080 | {
|
---|
1081 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(firstAncestor)).getScore() * _parentBonusPercentage));
|
---|
1082 | }
|
---|
1083 | }
|
---|
1084 |
|
---|
1085 | if (secondAncestorList != null && secondAncestorList.size() > 0)
|
---|
1086 | {
|
---|
1087 | secondAncestor = secondAncestorList.get(0);
|
---|
1088 |
|
---|
1089 | if (_places.contains(secondAncestor))
|
---|
1090 | {
|
---|
1091 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(secondAncestor)).getScore() * _parentBonusPercentage));
|
---|
1092 | }
|
---|
1093 | }
|
---|
1094 | }
|
---|
1095 | */ |
---|