1 | package org.greenstone.server;
|
---|
2 |
|
---|
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;

import org.greenstone.client.Place;

import com.google.apphosting.utils.servlet.DatastoreViewerServlet.Page;
|
---|
11 |
|
---|
12 | public class PageScanner
|
---|
13 | {
|
---|
14 | // Stores the gazeteer structure (used to verify if words are a place name
|
---|
15 | // or not)
|
---|
16 | protected GazetteerTrieType2 _gazetteer = null;
|
---|
17 |
|
---|
18 | // Stores all of the places in the page that is being examined
|
---|
19 | protected ArrayList<Place> _places = new ArrayList<Place>();
|
---|
20 | protected HashMap<String, ArrayList<Place>> _placeNameMap = new HashMap<String, ArrayList<Place>>();
|
---|
21 | protected String _markedUpText = null;
|
---|
22 | protected HashMap<String, ArrayList<Place>> _placeCache = new HashMap<String, ArrayList<Place>>();
|
---|
23 |
|
---|
24 | protected GateScanner _gateScanner = new GateScanner();
|
---|
25 |
|
---|
26 | // Parameters for score calculations
|
---|
27 | // *********************************
|
---|
28 | protected double _penaltyPercentage = 0.5;
|
---|
29 | protected double _parentBonusPercentage = 0.25;
|
---|
30 | protected double _indirectReferencePenaltyPercentage = 0.25;
|
---|
31 | protected double _parentLimitPercentage = 0.05;
|
---|
32 |
|
---|
33 | protected ArrayList<String> _prevDoc = null;
|
---|
34 | protected String _prevFileName = null;
|
---|
35 |
|
---|
36 | /**
|
---|
37 | * Default constructer. It creates the place data structure and gazetteer
|
---|
38 | * trie structure
|
---|
39 | */
|
---|
40 | public PageScanner(String path)
|
---|
41 | {
|
---|
42 | System.out.println("Loading path = " + path);
|
---|
43 | try
|
---|
44 | {
|
---|
45 | PlaceInformation.init();
|
---|
46 | }
|
---|
47 | catch (Exception ex)
|
---|
48 | {
|
---|
49 | ex.printStackTrace();
|
---|
50 | }
|
---|
51 | System.out.println("Starting loading gazetteer");
|
---|
52 | _gazetteer = new GazetteerTrieType2(path + "/dataen.txt");
|
---|
53 | }
|
---|
54 |
|
---|
55 | /**
|
---|
56 | * Examines the given text to find place names and score them
|
---|
57 | *
|
---|
58 | * @param text
|
---|
59 | * is the text to examine
|
---|
60 | */
|
---|
61 | public ArrayList<Place> examineTextWithGate(String text, String htmlString)
|
---|
62 | {
|
---|
63 | System.out.println(htmlString);
|
---|
64 | // System.out.println("Examining text");
|
---|
65 |
|
---|
66 | StringBuilder html = null;
|
---|
67 | if (htmlString != null)
|
---|
68 | {
|
---|
69 | html = new StringBuilder(htmlString);
|
---|
70 | }
|
---|
71 |
|
---|
72 | _places.clear();
|
---|
73 | try
|
---|
74 | {
|
---|
75 | ArrayList<Word> classifiedWords = _gateScanner.classifyText(text);
|
---|
76 |
|
---|
77 | // The file to read from
|
---|
78 | BufferedReader file = new BufferedReader(new StringReader(text));
|
---|
79 |
|
---|
80 | // Stores the current words being examined
|
---|
81 | StringBuilder currentWords = new StringBuilder();
|
---|
82 |
|
---|
83 | // Stores the current line being examined
|
---|
84 | String currentLine = null;
|
---|
85 |
|
---|
86 | int htmlIndex = 0;
|
---|
87 | int textIndex = 0;
|
---|
88 | int cwindex = 0;
|
---|
89 |
|
---|
90 | // The list of words in the line
|
---|
91 | ArrayList<String> words = new ArrayList<String>();
|
---|
92 |
|
---|
93 | // Read each line from the file
|
---|
94 | while ((currentLine = file.readLine()) != null)
|
---|
95 | {
|
---|
96 | words.clear();
|
---|
97 |
|
---|
98 | // Find the words in those lines and add them to the list
|
---|
99 | words.addAll(MarkupService.findWords(currentLine));
|
---|
100 |
|
---|
101 | // Examine the words
|
---|
102 | for (int j = 0; j < words.size(); j++)
|
---|
103 | {
|
---|
104 | int oldHTMLIndex = 0;
|
---|
105 | if (htmlString != null)
|
---|
106 | {
|
---|
107 | oldHTMLIndex = htmlIndex;
|
---|
108 | htmlIndex = html.indexOf(words.get(j), htmlIndex);
|
---|
109 | }
|
---|
110 | textIndex = text.indexOf(words.get(j), textIndex);
|
---|
111 | Word currentWord = null;
|
---|
112 | int cwindexBefore = cwindex;
|
---|
113 | for (; cwindex < classifiedWords.size(); cwindex++)
|
---|
114 | {
|
---|
115 | if (textIndex > 0 && text.charAt(textIndex - 1) == '\'')
|
---|
116 | {
|
---|
117 | break;
|
---|
118 | }
|
---|
119 |
|
---|
120 | //System.out.println("Looking for \"" + words.get(j) + "\" got \"" + classifiedWords.get(cwindex) + "\"");
|
---|
121 | if (classifiedWords.get(cwindex).getValue().equals(words.get(j)))
|
---|
122 | {
|
---|
123 | currentWord = classifiedWords.get(cwindex);
|
---|
124 | break;
|
---|
125 | }
|
---|
126 | }
|
---|
127 |
|
---|
128 | if (currentWord == null)
|
---|
129 | {
|
---|
130 | cwindex = cwindexBefore;
|
---|
131 | continue;
|
---|
132 | }
|
---|
133 |
|
---|
134 | // If the word does not begin with an uppercase letter then ignore it
|
---|
135 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))) || !(currentWord.getClassification().equals("NNP")))
|
---|
136 | {
|
---|
137 | continue;
|
---|
138 | }
|
---|
139 |
|
---|
140 | // Used to store a good place name
|
---|
141 | String lastGoodPlaceName = null;
|
---|
142 |
|
---|
143 | // Add the first word to the list of words to be examined
|
---|
144 | currentWords.append(words.get(j));
|
---|
145 |
|
---|
146 | // While the gazetteer does not reach a dead end, add
|
---|
147 | // another word and examine them
|
---|
148 | int count = 1;
|
---|
149 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
150 | {
|
---|
151 | // If the words are a place name then store it
|
---|
152 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
153 | {
|
---|
154 | lastGoodPlaceName = currentWords.toString();
|
---|
155 | }
|
---|
156 |
|
---|
157 | // If it is not the end of the words the add the next
|
---|
158 | // word
|
---|
159 | if (j + count < words.size())
|
---|
160 | {
|
---|
161 | currentWords.append(" " + words.get(j + count++));
|
---|
162 | }
|
---|
163 | else
|
---|
164 | {
|
---|
165 | break;
|
---|
166 | }
|
---|
167 | }
|
---|
168 |
|
---|
169 | // If there was a place name found then find its information
|
---|
170 | // and score it
|
---|
171 | if (lastGoodPlaceName != null)
|
---|
172 | {
|
---|
173 | if (htmlString != null)
|
---|
174 | {
|
---|
175 | if (htmlIndex == -1)
|
---|
176 | {
|
---|
177 | htmlIndex = oldHTMLIndex;
|
---|
178 | }
|
---|
179 | else
|
---|
180 | {
|
---|
181 | html.insert(htmlIndex, "<span class=\"place\">");
|
---|
182 | html.insert(htmlIndex + "<span class=\"place\">".length() + lastGoodPlaceName.length(), "</span>");
|
---|
183 | }
|
---|
184 | }
|
---|
185 |
|
---|
186 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
187 |
|
---|
188 | if (placeList == null)
|
---|
189 | {
|
---|
190 | continue;
|
---|
191 | }
|
---|
192 |
|
---|
193 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
194 | {
|
---|
195 | for(Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
196 | {
|
---|
197 | p.directReference();
|
---|
198 | }
|
---|
199 | }
|
---|
200 |
|
---|
201 | for (Place p : placeList)
|
---|
202 | {
|
---|
203 | p.directReference();
|
---|
204 | addScore(p, 256);
|
---|
205 | }
|
---|
206 | }
|
---|
207 |
|
---|
208 | currentWords = new StringBuilder();
|
---|
209 | }
|
---|
210 | }
|
---|
211 | file.close();
|
---|
212 | }
|
---|
213 | catch (Exception ex)
|
---|
214 | {
|
---|
215 | ex.printStackTrace();
|
---|
216 | }
|
---|
217 |
|
---|
218 | if (htmlString != null)
|
---|
219 | {
|
---|
220 | _markedUpText = html.toString();
|
---|
221 | }
|
---|
222 |
|
---|
223 | if(_places.size() > 0)
|
---|
224 | {
|
---|
225 | adjustScores();
|
---|
226 | }
|
---|
227 |
|
---|
228 | return _places;
|
---|
229 | }
|
---|
230 |
|
---|
231 | public ArrayList<Place> examineTextWithoutGate(ArrayList<ArrayList<String>> lines)
|
---|
232 | {
|
---|
233 | _places.clear();
|
---|
234 | try
|
---|
235 | {
|
---|
236 | // Stores the current words being examined
|
---|
237 | StringBuilder currentWords = new StringBuilder();
|
---|
238 |
|
---|
239 | // Read each line from the file
|
---|
240 | for(ArrayList<String> words : lines)
|
---|
241 | {
|
---|
242 | // Examine the words
|
---|
243 | for (int j = 0; j < words.size(); j++)
|
---|
244 | {
|
---|
245 | // If the word does not begin with an uppercase letter then
|
---|
246 | // ignore it
|
---|
247 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
248 | {
|
---|
249 | continue;
|
---|
250 | }
|
---|
251 |
|
---|
252 | // Used to store a good place name
|
---|
253 | String lastGoodPlaceName = null;
|
---|
254 |
|
---|
255 | // Add the first word to the list of words to be examined
|
---|
256 | currentWords.append(words.get(j));
|
---|
257 |
|
---|
258 | // While the gazetteer does not reach a dead end, add
|
---|
259 | // another word and examine them
|
---|
260 | int count = 1;
|
---|
261 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
262 | {
|
---|
263 | // If the words are a place name then store it
|
---|
264 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
265 | {
|
---|
266 | lastGoodPlaceName = currentWords.toString();
|
---|
267 | // System.out.println("Current place name part => "
|
---|
268 | // + lastGoodPlaceName);
|
---|
269 | }
|
---|
270 |
|
---|
271 | // If it is not the end of the words the add the next
|
---|
272 | // word
|
---|
273 | if (j + count < words.size())
|
---|
274 | {
|
---|
275 | currentWords.append(" " + words.get(j + count++));
|
---|
276 | }
|
---|
277 | else
|
---|
278 | {
|
---|
279 | break;
|
---|
280 | }
|
---|
281 | }
|
---|
282 |
|
---|
283 | // If there was a place name found then find its information
|
---|
284 | // and score it
|
---|
285 | if (lastGoodPlaceName != null)
|
---|
286 | {
|
---|
287 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
288 |
|
---|
289 | if (placeList == null)
|
---|
290 | {
|
---|
291 | continue;
|
---|
292 | }
|
---|
293 |
|
---|
294 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
295 | {
|
---|
296 | for(Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
297 | {
|
---|
298 | p.directReference();
|
---|
299 | }
|
---|
300 | }
|
---|
301 |
|
---|
302 | for (Place p : placeList)
|
---|
303 | {
|
---|
304 | p.directReference();
|
---|
305 | addScore(p, 256);
|
---|
306 | }
|
---|
307 | }
|
---|
308 | currentWords = new StringBuilder();
|
---|
309 | }
|
---|
310 | }
|
---|
311 | }
|
---|
312 | catch (Exception ex)
|
---|
313 | {
|
---|
314 | ex.printStackTrace();
|
---|
315 | }
|
---|
316 |
|
---|
317 | adjustScores();
|
---|
318 |
|
---|
319 | return _places;
|
---|
320 | }
|
---|
321 |
|
---|
322 | public ArrayList<String> getPlaceNames(String text)
|
---|
323 | {
|
---|
324 | ArrayList<String> placeNames = new ArrayList<String>();
|
---|
325 | _places.clear();
|
---|
326 | try
|
---|
327 | {
|
---|
328 | // The file to read from
|
---|
329 | BufferedReader file = new BufferedReader(new StringReader(text));
|
---|
330 |
|
---|
331 | // Stores the current words being examined
|
---|
332 | StringBuilder currentWords = new StringBuilder();
|
---|
333 |
|
---|
334 | // Stores the current line being examined
|
---|
335 | String currentLine = null;
|
---|
336 |
|
---|
337 | // The list of words in the line
|
---|
338 | ArrayList<String> words = new ArrayList<String>();
|
---|
339 |
|
---|
340 | // Read each line from the file
|
---|
341 | while ((currentLine = file.readLine()) != null)
|
---|
342 | {
|
---|
343 | words.clear();
|
---|
344 |
|
---|
345 | // Find the words in those lines and add them to the list
|
---|
346 | words.addAll(MarkupService.findWords(currentLine));
|
---|
347 |
|
---|
348 | // Examine the words
|
---|
349 | for (int j = 0; j < words.size(); j++)
|
---|
350 | {
|
---|
351 | // If the word does not begin with an uppercase letter then
|
---|
352 | // ignore it
|
---|
353 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
354 | {
|
---|
355 | continue;
|
---|
356 | }
|
---|
357 |
|
---|
358 | // Used to store a good place name
|
---|
359 | String lastGoodPlaceName = null;
|
---|
360 |
|
---|
361 | // Add the first word to the list of words to be examined
|
---|
362 | currentWords.append(words.get(j));
|
---|
363 |
|
---|
364 | // While the gazetteer does not reach a dead end, add
|
---|
365 | // another word and examine them
|
---|
366 | int count = 1;
|
---|
367 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
368 | {
|
---|
369 | // If the words are a place name then store it
|
---|
370 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
371 | {
|
---|
372 | lastGoodPlaceName = currentWords.toString();
|
---|
373 | // System.out.println("Current place name part => "
|
---|
374 | // + lastGoodPlaceName);
|
---|
375 | }
|
---|
376 |
|
---|
377 | // If it is not the end of the words the add the next
|
---|
378 | // word
|
---|
379 | if (j + count < words.size())
|
---|
380 | {
|
---|
381 | currentWords.append(" " + words.get(j + count++));
|
---|
382 | }
|
---|
383 | else
|
---|
384 | {
|
---|
385 | break;
|
---|
386 | }
|
---|
387 | }
|
---|
388 |
|
---|
389 | // If there was a place name found then find its information
|
---|
390 | // and score it
|
---|
391 | if (lastGoodPlaceName != null)
|
---|
392 | {
|
---|
393 | placeNames.add(lastGoodPlaceName);
|
---|
394 | }
|
---|
395 | currentWords = new StringBuilder();
|
---|
396 | }
|
---|
397 | }
|
---|
398 | file.close();
|
---|
399 | }
|
---|
400 | catch (Exception ex)
|
---|
401 | {
|
---|
402 | ex.printStackTrace();
|
---|
403 | }
|
---|
404 |
|
---|
405 | return placeNames;
|
---|
406 | }
|
---|
407 |
|
---|
408 | public ArrayList<Place> examineArrayOfStrings(ArrayList<String> placeNames)
|
---|
409 | {
|
---|
410 | _places.clear();
|
---|
411 | for(int i = 0; i < placeNames.size(); i++)
|
---|
412 | {
|
---|
413 | String currentPlaceName = placeNames.get(i);
|
---|
414 |
|
---|
415 | ArrayList<Place> placeList = null;
|
---|
416 | if(_placeCache.containsKey(currentPlaceName))
|
---|
417 | {
|
---|
418 | placeList = _placeCache.get(currentPlaceName);
|
---|
419 | for(Place p : placeList)
|
---|
420 | {
|
---|
421 | p.unDirectReference();
|
---|
422 | }
|
---|
423 | }
|
---|
424 | else
|
---|
425 | {
|
---|
426 | placeList = PlaceInformation.getPlaces(currentPlaceName);
|
---|
427 | _placeCache.put(currentPlaceName, placeList);
|
---|
428 | }
|
---|
429 |
|
---|
430 | if (placeList == null)
|
---|
431 | {
|
---|
432 | continue;
|
---|
433 | }
|
---|
434 |
|
---|
435 | if (_placeNameMap.containsKey(currentPlaceName))
|
---|
436 | {
|
---|
437 | for(Place p : _placeNameMap.get(currentPlaceName))
|
---|
438 | {
|
---|
439 | p.directReference();
|
---|
440 | }
|
---|
441 | }
|
---|
442 |
|
---|
443 | for (Place p : placeList)
|
---|
444 | {
|
---|
445 | p.directReference();
|
---|
446 | addScore(p, 256);
|
---|
447 | }
|
---|
448 | }
|
---|
449 |
|
---|
450 | // System.out.println("Done!");
|
---|
451 | // System.out.print("Adjusting Scores... ");
|
---|
452 |
|
---|
453 | adjustScores();
|
---|
454 |
|
---|
455 | // System.out.println("Done!");
|
---|
456 | // System.out.print("Sorting Scores... ");
|
---|
457 |
|
---|
458 | // System.out.println("Done!");
|
---|
459 |
|
---|
460 | // System.out.println("Places found = " + _places);
|
---|
461 |
|
---|
462 | return _places;
|
---|
463 | }
|
---|
464 |
|
---|
465 | public ArrayList<Place> examineTextWithoutGate(String text)
|
---|
466 | {
|
---|
467 | _places.clear();
|
---|
468 | try
|
---|
469 | {
|
---|
470 | // The file to read from
|
---|
471 | BufferedReader file = new BufferedReader(new StringReader(text));
|
---|
472 |
|
---|
473 | // Stores the current words being examined
|
---|
474 | StringBuilder currentWords = new StringBuilder();
|
---|
475 |
|
---|
476 | // Stores the current line being examined
|
---|
477 | String currentLine = null;
|
---|
478 |
|
---|
479 | // The list of words in the line
|
---|
480 | ArrayList<String> words = new ArrayList<String>();
|
---|
481 |
|
---|
482 | // Read each line from the file
|
---|
483 | while ((currentLine = file.readLine()) != null)
|
---|
484 | {
|
---|
485 | words.clear();
|
---|
486 |
|
---|
487 | // Find the words in those lines and add them to the list
|
---|
488 | words.addAll(MarkupService.findWords(currentLine));
|
---|
489 |
|
---|
490 | // Examine the words
|
---|
491 | for (int j = 0; j < words.size(); j++)
|
---|
492 | {
|
---|
493 | // If the word does not begin with an uppercase letter then
|
---|
494 | // ignore it
|
---|
495 | if (Character.isLowerCase(words.get(j).charAt(0)) || !(words.get(j).length() > 1 && Character.isLowerCase(words.get(j).charAt(1))))
|
---|
496 | {
|
---|
497 | continue;
|
---|
498 | }
|
---|
499 |
|
---|
500 | // Used to store a good place name
|
---|
501 | String lastGoodPlaceName = null;
|
---|
502 |
|
---|
503 | // Add the first word to the list of words to be examined
|
---|
504 | currentWords.append(words.get(j));
|
---|
505 |
|
---|
506 | // While the gazetteer does not reach a dead end, add
|
---|
507 | // another word and examine them
|
---|
508 | int count = 1;
|
---|
509 | while (_gazetteer.checkPlaceName(currentWords.toString()) != -1)
|
---|
510 | {
|
---|
511 | // If the words are a place name then store it
|
---|
512 | if (_gazetteer.checkPlaceName(currentWords.toString()) == 1)
|
---|
513 | {
|
---|
514 | lastGoodPlaceName = currentWords.toString();
|
---|
515 | // System.out.println("Current place name part => "
|
---|
516 | // + lastGoodPlaceName);
|
---|
517 | }
|
---|
518 |
|
---|
519 | // If it is not the end of the words the add the next
|
---|
520 | // word
|
---|
521 | if (j + count < words.size())
|
---|
522 | {
|
---|
523 | currentWords.append(" " + words.get(j + count++));
|
---|
524 | }
|
---|
525 | else
|
---|
526 | {
|
---|
527 | break;
|
---|
528 | }
|
---|
529 | }
|
---|
530 |
|
---|
531 | // If there was a place name found then find its information
|
---|
532 | // and score it
|
---|
533 | if (lastGoodPlaceName != null)
|
---|
534 | {
|
---|
535 | ArrayList<Place> placeList = PlaceInformation.getPlaces(lastGoodPlaceName);
|
---|
536 |
|
---|
537 | if (placeList == null)
|
---|
538 | {
|
---|
539 | continue;
|
---|
540 | }
|
---|
541 |
|
---|
542 | if (_placeNameMap.containsKey(lastGoodPlaceName))
|
---|
543 | {
|
---|
544 | for(Place p : _placeNameMap.get(lastGoodPlaceName))
|
---|
545 | {
|
---|
546 | p.directReference();
|
---|
547 | }
|
---|
548 | }
|
---|
549 |
|
---|
550 | for (Place p : placeList)
|
---|
551 | {
|
---|
552 | p.directReference();
|
---|
553 | addScore(p, 256);
|
---|
554 | }
|
---|
555 | }
|
---|
556 | currentWords = new StringBuilder();
|
---|
557 | }
|
---|
558 | }
|
---|
559 | file.close();
|
---|
560 | }
|
---|
561 | catch (Exception ex)
|
---|
562 | {
|
---|
563 | ex.printStackTrace();
|
---|
564 | }
|
---|
565 |
|
---|
566 | // System.out.println("Done!");
|
---|
567 | // System.out.print("Adjusting Scores... ");
|
---|
568 |
|
---|
569 | adjustScores();
|
---|
570 |
|
---|
571 | // System.out.println("Done!");
|
---|
572 | // System.out.print("Sorting Scores... ");
|
---|
573 | // System.out.println("Done!");
|
---|
574 | // System.out.println("Places found = " + _places);
|
---|
575 |
|
---|
576 | return _places;
|
---|
577 | }
|
---|
578 |
|
---|
579 | public String getMarkedUpText()
|
---|
580 | {
|
---|
581 | return _markedUpText;
|
---|
582 | }
|
---|
583 |
|
---|
584 | public Place getTopScorePlace()
|
---|
585 | {
|
---|
586 | Place p = _places.get(0);
|
---|
587 |
|
---|
588 | for(Place pp : _places)
|
---|
589 | {
|
---|
590 | if(pp.getScore() > p.getScore())
|
---|
591 | {
|
---|
592 | p = pp;
|
---|
593 | }
|
---|
594 | }
|
---|
595 | return p;
|
---|
596 | }
|
---|
597 |
|
---|
598 | /**
|
---|
599 | * Used to make the original place scores more accurate by using other place
|
---|
600 | * information
|
---|
601 | */
|
---|
602 | public void adjustScores()
|
---|
603 | {
|
---|
604 | for(Place p : _places)
|
---|
605 | {
|
---|
606 | if(!p.isDirectlyReferenced())
|
---|
607 | {
|
---|
608 | p.setScore((int)(p.getScore() * _penaltyPercentage));
|
---|
609 | }
|
---|
610 | }
|
---|
611 |
|
---|
612 | Place topScore = getTopScorePlace();
|
---|
613 |
|
---|
614 | for(Place p : _places)
|
---|
615 | {
|
---|
616 | if (p.getParentPlaceName() == null)
|
---|
617 | {
|
---|
618 | continue;
|
---|
619 | }
|
---|
620 |
|
---|
621 | for (Place pp : _places)
|
---|
622 | {
|
---|
623 | if(p.isIn(pp) && (topScore.getScore() - pp.getScore()) <= (int)(topScore.getScore() * 0.1 * _parentLimitPercentage))
|
---|
624 | {
|
---|
625 | p.setScore((int)(p.getScore() + (pp.getScore() * _parentBonusPercentage)));
|
---|
626 | }
|
---|
627 | }
|
---|
628 | }
|
---|
629 | }
|
---|
630 |
|
---|
631 | /**
|
---|
632 | * Adds one to the score of the given place
|
---|
633 | *
|
---|
634 | * @param p
|
---|
635 | * is the place to add one to the score of
|
---|
636 | */
|
---|
637 | public void addScore(Place p, Integer scoreToAdd)
|
---|
638 | {
|
---|
639 | if (p == null)
|
---|
640 | {
|
---|
641 | return;
|
---|
642 | }
|
---|
643 |
|
---|
644 | // If there is already a score for this key the increase it by one
|
---|
645 | if (_places.contains(p))
|
---|
646 | {
|
---|
647 | Place place = _places.get(_places.indexOf(p));
|
---|
648 | place.setScore(place.getScore() + scoreToAdd);
|
---|
649 | }
|
---|
650 | // If there is no score for this key then make one
|
---|
651 | else
|
---|
652 | {
|
---|
653 | p.setScore(scoreToAdd);
|
---|
654 | _places.add(p);
|
---|
655 |
|
---|
656 | if(_placeNameMap.containsKey(p.getName()))
|
---|
657 | {
|
---|
658 | _placeNameMap.get(p.getName()).add(p);
|
---|
659 | }
|
---|
660 | else
|
---|
661 | {
|
---|
662 | ArrayList<Place> placeList = new ArrayList<Place>();
|
---|
663 | _placeNameMap.put(p.getName(), placeList);
|
---|
664 | }
|
---|
665 | }
|
---|
666 |
|
---|
667 | // Add to the parent's score too (if there is one)
|
---|
668 | if (!(p.getParentPlaceName() == null))
|
---|
669 | {
|
---|
670 | // If there is an ancestor then add to it's score as well
|
---|
671 | if (p.getParentPlaceName().contains(", "))
|
---|
672 | {
|
---|
673 | String[] places = p.getParentPlaceName().split(", ");
|
---|
674 |
|
---|
675 | ArrayList<Place> specificPlaces = null;
|
---|
676 |
|
---|
677 | if (places.length == 3)
|
---|
678 | {
|
---|
679 | specificPlaces = PlaceInformation.getSpecificPlace(places[0], places[1] + ", " + places[2]);
|
---|
680 | }
|
---|
681 | else
|
---|
682 | {
|
---|
683 | specificPlaces = PlaceInformation.getSpecificPlace(places[0], places[1]);
|
---|
684 | }
|
---|
685 |
|
---|
686 | if (specificPlaces == null)
|
---|
687 | {
|
---|
688 | return;
|
---|
689 | }
|
---|
690 |
|
---|
691 | for (Place pp : specificPlaces)
|
---|
692 | {
|
---|
693 | addScore(pp, (int) (scoreToAdd * _indirectReferencePenaltyPercentage));
|
---|
694 | }
|
---|
695 | }
|
---|
696 | // Otherwise just add the parent
|
---|
697 | else
|
---|
698 | {
|
---|
699 | ArrayList<Place> specificPlaces = PlaceInformation.getSpecificPlace(p.getParentPlaceName(), null);
|
---|
700 |
|
---|
701 | if (specificPlaces == null)
|
---|
702 | {
|
---|
703 | return;
|
---|
704 | }
|
---|
705 |
|
---|
706 | for (Place pp : specificPlaces)
|
---|
707 | {
|
---|
708 | addScore(pp, (int) (scoreToAdd * _indirectReferencePenaltyPercentage));
|
---|
709 | }
|
---|
710 | }
|
---|
711 | }
|
---|
712 | }
|
---|
713 |
|
---|
714 |
|
---|
715 | public void sortScores()
|
---|
716 | {
|
---|
717 | ArrayList<Place> sortedPlaces = new ArrayList<Place>();
|
---|
718 |
|
---|
719 | while (_places.size() > 0)
|
---|
720 | {
|
---|
721 | int index = -1;
|
---|
722 | for (int j = 0; j < _places.size(); j++)
|
---|
723 | {
|
---|
724 | if (index == -1 || _places.get(j).getScore() > _places.get(index).getScore())
|
---|
725 | {
|
---|
726 | index = j;
|
---|
727 | }
|
---|
728 | }
|
---|
729 |
|
---|
730 | sortedPlaces.add(_places.remove(index));
|
---|
731 | }
|
---|
732 |
|
---|
733 | _places = sortedPlaces;
|
---|
734 | }
|
---|
735 |
|
---|
736 | public ArrayList<Place> getPlacesWithParams(double maxScorePercentage, double minScorePercentage, long minPopulation, boolean locality, boolean region, boolean country, int numberOfPlacesToGet)
|
---|
737 | {
|
---|
738 | // System.out.println("Getting places with params");
|
---|
739 |
|
---|
740 | int topScore = _places.get(0).getScore();
|
---|
741 | int minScore = (int) (topScore * minScorePercentage);
|
---|
742 | int maxScore = (int) (topScore * maxScorePercentage);
|
---|
743 |
|
---|
744 | // System.out.println("minScore = " + minScore);
|
---|
745 | // System.out.println("maxScore = " + maxScore);
|
---|
746 |
|
---|
747 | ArrayList<Place> matchingPlaces = new ArrayList<Place>();
|
---|
748 |
|
---|
749 | // Go through all the markers
|
---|
750 | for (Place p : _places)
|
---|
751 | {
|
---|
752 | // System.out.println("Testing place -> " + p.getName());
|
---|
753 | if (p == null)
|
---|
754 | {
|
---|
755 | // System.out.println("P is null?");
|
---|
756 | continue;
|
---|
757 | }
|
---|
758 |
|
---|
759 | // If the place meets the criteria
|
---|
760 | if (p.getScore() > minScore && p.getScore() <= maxScore && p.getPopulation() > minPopulation && ((locality && p.getPlaceType().equals("locality")) || (region && p.getPlaceType().equals("region")) || (country && p.getPlaceType().equals("country"))))
|
---|
761 | {
|
---|
762 | // System.out.println("MATCH!");
|
---|
763 | // If there is not already the maximum amount of visible places
|
---|
764 | // then add this
|
---|
765 | if (matchingPlaces.size() < numberOfPlacesToGet)
|
---|
766 | {
|
---|
767 | matchingPlaces.add(p);
|
---|
768 | }
|
---|
769 | // If there is already MAXVISIBLE visible places then see if
|
---|
770 | // this place should replace one
|
---|
771 | else
|
---|
772 | {
|
---|
773 | Place minScorePlace = null;
|
---|
774 | for (Place q : matchingPlaces)
|
---|
775 | {
|
---|
776 | if (minScorePlace == null)
|
---|
777 | {
|
---|
778 | minScorePlace = q;
|
---|
779 | }
|
---|
780 | if (q.getScore() < minScorePlace.getScore())
|
---|
781 | {
|
---|
782 | minScorePlace = q;
|
---|
783 | }
|
---|
784 | }
|
---|
785 |
|
---|
786 | if (p.getScore() > minScorePlace.getScore())
|
---|
787 | {
|
---|
788 | matchingPlaces.remove(minScorePlace);
|
---|
789 | matchingPlaces.add(p);
|
---|
790 | }
|
---|
791 | }
|
---|
792 | }
|
---|
793 | else
|
---|
794 | {
|
---|
795 | // System.out.println("NOT A MATCH");
|
---|
796 | }
|
---|
797 | }
|
---|
798 | return matchingPlaces;
|
---|
799 | }
|
---|
800 |
|
---|
801 | public void setScoringParams(double penaltyPercentage, double parentBonusPercentage, double indirectReferencePenaltyPercentage)
|
---|
802 | {
|
---|
803 | _penaltyPercentage = penaltyPercentage;
|
---|
804 | _parentBonusPercentage = parentBonusPercentage;
|
---|
805 | _indirectReferencePenaltyPercentage = indirectReferencePenaltyPercentage;
|
---|
806 | }
|
---|
807 |
|
---|
808 | public void setScoringParams(ScanConfiguration config)
|
---|
809 | {
|
---|
810 | _penaltyPercentage = config.getPenalty();
|
---|
811 | _parentBonusPercentage = config.getParentBonus();
|
---|
812 | _indirectReferencePenaltyPercentage = config.getIndirectReferencePenalty();
|
---|
813 | _parentLimitPercentage = config.getParentLimit();
|
---|
814 | }
|
---|
815 |
|
---|
816 | // public HashMap<String, Integer> wordCount(String fileName)
|
---|
817 | // {
|
---|
818 | // HashMap<String, Integer> wordCountMap = new HashMap<String, Integer>();
|
---|
819 | //
|
---|
820 | // try
|
---|
821 | // {
|
---|
822 | // BufferedReader file = new BufferedReader(new FileReader(fileName));
|
---|
823 | //
|
---|
824 | // StringBuilder currentWord = new StringBuilder();
|
---|
825 | //
|
---|
826 | // String line = "";
|
---|
827 | //
|
---|
828 | // ArrayList<String> words = new ArrayList<String>();
|
---|
829 | //
|
---|
830 | // //System.out.print("Finding words... ");
|
---|
831 | // while((line = file.readLine()) != null)
|
---|
832 | // {
|
---|
833 | // words.addAll(MarkupService.findWords(line));
|
---|
834 | // }
|
---|
835 | // //System.out.println("Done!");
|
---|
836 | //
|
---|
837 | // //System.out.print("Adding up scores... ");
|
---|
838 | // for(int j = 0; j < words.size(); j++)
|
---|
839 | // {
|
---|
840 | // if(Character.isLowerCase(words.get(j).charAt(0)))
|
---|
841 | // {
|
---|
842 | // //continue;
|
---|
843 | // }
|
---|
844 | //
|
---|
845 | // currentWord.append(words.get(j));
|
---|
846 | //
|
---|
847 | // int count = 1;
|
---|
848 | // while(_gazetteer.checkPlaceName(currentWord.toString()) != -1)
|
---|
849 | // {
|
---|
850 | // if(_gazetteer.checkPlaceName(currentWord.toString()) == 1)
|
---|
851 | // {
|
---|
852 | // if(wordCountMap.containsKey(currentWord.toString()))
|
---|
853 | // {
|
---|
854 | // Integer i = wordCountMap.get(currentWord.toString());
|
---|
855 | // wordCountMap.put(currentWord.toString(), ++i);
|
---|
856 | // }
|
---|
857 | // else
|
---|
858 | // {
|
---|
859 | // wordCountMap.put(currentWord.toString(), 1);
|
---|
860 | // }
|
---|
861 | // }
|
---|
862 | // currentWord.append(" " + words.get(j + count++));
|
---|
863 | // }
|
---|
864 | //
|
---|
865 | // currentWord.delete(0, currentWord.length());
|
---|
866 | // }
|
---|
867 | // }
|
---|
868 | // catch(Exception ex)
|
---|
869 | // {
|
---|
870 | // ex.printStackTrace();
|
---|
871 | // }
|
---|
872 | //
|
---|
873 | // return wordCountMap;
|
---|
874 | // }
|
---|
875 |
|
---|
876 | public boolean isGazetteerLoaded()
|
---|
877 | {
|
---|
878 | return _gazetteer != null;
|
---|
879 | }
|
---|
880 |
|
---|
881 | public ArrayList<ArrayList<Place>> examineMultipleTexts(ArrayList<String> texts)
|
---|
882 | {
|
---|
883 | ArrayList<ArrayList<Place>> multipleResults = new ArrayList<ArrayList<Place>>();
|
---|
884 | for (String text : texts)
|
---|
885 | {
|
---|
886 | if (text != null)
|
---|
887 | {
|
---|
888 | this.examineTextWithGate(text, null);
|
---|
889 |
|
---|
890 | if(_places.size() > 0)
|
---|
891 | {
|
---|
892 | multipleResults.add(new ArrayList<Place>(_places));
|
---|
893 | }
|
---|
894 | else
|
---|
895 | {
|
---|
896 | multipleResults.add(null);
|
---|
897 | }
|
---|
898 | }
|
---|
899 | else
|
---|
900 | {
|
---|
901 | multipleResults.add(null);
|
---|
902 | }
|
---|
903 | }
|
---|
904 |
|
---|
905 | return multipleResults;
|
---|
906 | }
|
---|
907 |
|
---|
908 | public ArrayList<Place> getPlaces()
|
---|
909 | {
|
---|
910 | return _places;
|
---|
911 | }
|
---|
912 |
|
---|
913 | public void clearPlaces()
|
---|
914 | {
|
---|
915 | _places.clear();
|
---|
916 | }
|
---|
917 | }
|
---|
918 |
|
---|
919 | /*
|
---|
920 | *
|
---|
921 | * // Halve the scores for places that do not have their parents
|
---|
922 | // mentioned
|
---|
923 | // ********************************************************************
|
---|
924 |
|
---|
925 | String[] parentPlaceNames = p.getParentPlaceName().split(", ");
|
---|
926 | if (parentPlaceNames.length == 1)
|
---|
927 | {
|
---|
928 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], null);
|
---|
929 | Place parent = null;
|
---|
930 |
|
---|
931 | if (parentList != null && parentList.size() > 0)
|
---|
932 | {
|
---|
933 | parent = parentList.get(0);
|
---|
934 |
|
---|
935 | if (!_places.contains(parent) || !_places.get(_places.indexOf(parent)).isDirectlyReferenced())
|
---|
936 | {
|
---|
937 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
938 | }
|
---|
939 | }
|
---|
940 | }
|
---|
941 | else if (parentPlaceNames.length == 2)
|
---|
942 | {
|
---|
943 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1]);
|
---|
944 | ArrayList<Place> ancestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[1], null);
|
---|
945 |
|
---|
946 | Place parent = null;
|
---|
947 | Place ancestor = null;
|
---|
948 |
|
---|
949 | if (parentList != null && parentList.size() > 0)
|
---|
950 | {
|
---|
951 | parent = parentList.get(0);
|
---|
952 |
|
---|
953 | if (!_places.contains(parent) || !_places.get(_places.indexOf(parent)).isDirectlyReferenced())
|
---|
954 | {
|
---|
955 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
956 | }
|
---|
957 | }
|
---|
958 |
|
---|
959 | if (ancestorList != null && ancestorList.size() > 0)
|
---|
960 | {
|
---|
961 | ancestor = ancestorList.get(0);
|
---|
962 |
|
---|
963 | if (!_places.contains(ancestor) || !_places.get(_places.indexOf(ancestor)).isDirectlyReferenced())
|
---|
964 | {
|
---|
965 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
966 | }
|
---|
967 | }
|
---|
968 | }
|
---|
969 | else if (parentPlaceNames.length == 3)
|
---|
970 | {
|
---|
971 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1] + ", " + parentPlaceNames[2]);
|
---|
972 | ArrayList<Place> firstAncestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[1], parentPlaceNames[2]);
|
---|
973 | ArrayList<Place> secondAncestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[2], null);
|
---|
974 |
|
---|
975 | Place parent = null;
|
---|
976 | Place firstAncestor = null;
|
---|
977 | Place secondAncestor = null;
|
---|
978 |
|
---|
979 | if (parentList != null && parentList.size() > 0)
|
---|
980 | {
|
---|
981 | parent = parentList.get(0);
|
---|
982 |
|
---|
983 | if (!_places.contains(parent) || !_places.get(_places.indexOf(parent)).isDirectlyReferenced())
|
---|
984 | {
|
---|
985 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
986 | }
|
---|
987 | }
|
---|
988 |
|
---|
989 | if (firstAncestorList != null && firstAncestorList.size() > 0)
|
---|
990 | {
|
---|
991 | firstAncestor = firstAncestorList.get(0);
|
---|
992 |
|
---|
993 | if (!_places.contains(firstAncestor) || !_places.get(_places.indexOf(firstAncestor)).isDirectlyReferenced())
|
---|
994 | {
|
---|
995 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
996 | }
|
---|
997 | }
|
---|
998 |
|
---|
999 | if (secondAncestorList != null && secondAncestorList.size() > 0)
|
---|
1000 | {
|
---|
1001 | secondAncestor = secondAncestorList.get(0);
|
---|
1002 |
|
---|
1003 | if (!_places.contains(secondAncestor) || !_places.get(_places.indexOf(secondAncestor)).isDirectlyReferenced())
|
---|
1004 | {
|
---|
1005 | p.setScore((int) (p.getScore() * (1 - _penaltyPercentage)));
|
---|
1006 | }
|
---|
1007 | }
|
---|
1008 | }
|
---|
1009 | */
|
---|
1010 | // Add part of the parent's score to the child
|
---|
1011 | // *******************************************
|
---|
1012 | /*
|
---|
1013 | if (parentPlaceNames.length == 1)
|
---|
1014 | {
|
---|
1015 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], null);
|
---|
1016 |
|
---|
1017 | Place parent = null;
|
---|
1018 |
|
---|
1019 | if (parentList != null && parentList.size() > 0)
|
---|
1020 | {
|
---|
1021 | parent = parentList.get(0);
|
---|
1022 |
|
---|
1023 | if (_places.contains(parent))
|
---|
1024 | {
|
---|
1025 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
1026 | }
|
---|
1027 | }
|
---|
1028 | }
|
---|
1029 | else if (parentPlaceNames.length == 2)
|
---|
1030 | {
|
---|
1031 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1]);
|
---|
1032 | ArrayList<Place> ancestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[1], null);
|
---|
1033 |
|
---|
1034 | Place parent = null;
|
---|
1035 | Place ancestor = null;
|
---|
1036 |
|
---|
1037 | if (parentList != null && parentList.size() > 0)
|
---|
1038 | {
|
---|
1039 | parent = parentList.get(0);
|
---|
1040 |
|
---|
1041 | if (_places.contains(parent))
|
---|
1042 | {
|
---|
1043 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
1044 | }
|
---|
1045 | }
|
---|
1046 |
|
---|
1047 | if (ancestorList != null && ancestorList.size() > 0)
|
---|
1048 | {
|
---|
1049 | ancestor = ancestorList.get(0);
|
---|
1050 |
|
---|
1051 | if (_places.contains(ancestor))
|
---|
1052 | {
|
---|
1053 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(ancestor)).getScore() * _parentBonusPercentage));
|
---|
1054 | }
|
---|
1055 | }
|
---|
1056 | }
|
---|
1057 | else if (parentPlaceNames.length == 3)
|
---|
1058 | {
|
---|
1059 | ArrayList<Place> parentList = PlaceInformation.getSpecificPlace(parentPlaceNames[0], parentPlaceNames[1] + ", " + parentPlaceNames[2]);
|
---|
1060 | ArrayList<Place> firstAncestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[1], parentPlaceNames[2]);
|
---|
1061 | ArrayList<Place> secondAncestorList = PlaceInformation.getSpecificPlace(parentPlaceNames[2], null);
|
---|
1062 |
|
---|
1063 | Place parent = null;
|
---|
1064 | Place firstAncestor = null;
|
---|
1065 | Place secondAncestor = null;
|
---|
1066 |
|
---|
1067 | if (parentList != null && parentList.size() > 0)
|
---|
1068 | {
|
---|
1069 | parent = parentList.get(0);
|
---|
1070 |
|
---|
1071 | if (_places.contains(parent))
|
---|
1072 | {
|
---|
1073 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(parent)).getScore() * _parentBonusPercentage));
|
---|
1074 | }
|
---|
1075 | }
|
---|
1076 |
|
---|
1077 | if (firstAncestorList != null && firstAncestorList.size() > 0)
|
---|
1078 | {
|
---|
1079 | firstAncestor = firstAncestorList.get(0);
|
---|
1080 |
|
---|
1081 | if (_places.contains(firstAncestor))
|
---|
1082 | {
|
---|
1083 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(firstAncestor)).getScore() * _parentBonusPercentage));
|
---|
1084 | }
|
---|
1085 | }
|
---|
1086 |
|
---|
1087 | if (secondAncestorList != null && secondAncestorList.size() > 0)
|
---|
1088 | {
|
---|
1089 | secondAncestor = secondAncestorList.get(0);
|
---|
1090 |
|
---|
1091 | if (_places.contains(secondAncestor))
|
---|
1092 | {
|
---|
1093 | p.setScore(p.getScore() + (int) (_places.get(_places.indexOf(secondAncestor)).getScore() * _parentBonusPercentage));
|
---|
1094 | }
|
---|
1095 | }
|
---|
1096 | }
|
---|
1097 | */ |
---|