source: other-projects/the-macronizer/trunk/src/java/monogram/restorer/MonogramRestorer.java@ 29855

Last change on this file since 29855 was 29855, checked in by davidb, 9 years ago

John's code after refactoring by Tom over the summer of 2014/2015

File size: 8.0 KB
Line 
1/*
2 * This class does all the hard work. It looks up the input token in the MonogramModel and returns the restored token.
3 */
4package monogram.restorer;
5
6import monogram.model.MonogramFactory;
7import monogram.model.MonogramModel;
8import monogram.model.ListModel;
9import util.StringUtil;
10import web.servlets.DirectInput;
11
12/**
13* @author University of Waikato - Te Whare Wānanga o Waikato
14 * @version 1.0
15 * @since 2014-11-20
16 */
17public class MonogramRestorer {
18
19 private boolean preserveMacrons;
20 private MonogramModel macronModel;
21 private MonogramModel doubleVowelModel;
22 private ListModel blackListModel;
23
24 private String previousToken;
25 private String currentToken;
26 private Selector selector;
27
28 public MonogramRestorer(boolean preserveMacrons) {
29 System.out.println("test 9");
30 this.preserveMacrons = preserveMacrons;
31 System.out.println("test 10");
32 macronModel = MonogramFactory.getMacronModel();
33 System.out.println("test 11");
34 doubleVowelModel = MonogramFactory.getDoubleVowelModel();
35 System.out.println("test 12");
36 blackListModel = MonogramFactory.getBlackList();
37 previousToken = "";
38 selector = new Selector(3);
39 System.out.println("test done3");
40 }
41
42 public String restore(String token) {
43
44 // Test to see if it should preserve the macrons already in the input texts. If not remove all macrons.
45 if (!preserveMacrons && StringUtil.containsAccents(token)) {
46 token = StringUtil.removeAccents(token);
47 }
48
49
50 // Make lower case copy of token.
51 final String tokenLowerCase = token.toLowerCase();
52
53 String restoredToken = token;
54 //If the word is in the black list return the token with no macron.
55if(blackListModel.contains(token)){return restoredToken;}
56 // Is selector > 0 it is on a sequence
57 if (selector.isMacronTokenSequence()) {
58 if (restoreByMacronModel(tokenLowerCase)) {
59
60 //Restore capitalization
61 restoredToken = StringUtil.copyCapitalization(token, currentToken);
62 // If restoredToken is not the same as token then macron/s have been add so add <mark> html. This is the yellow background around the words on the front end.
63 if(!restoredToken.equals(token)){
64 // only add <mark> html tag if it directinput and not being output to a file.
65 if(DirectInput.DI){restoredToken = "<mark>"+restoredToken+"</mark> ";}
66 }
67
68 //if the tonken can not be restored by the MacronModel try and restor it with the DoubleVowelModel.
69 } else if (restoreByDoubleVowelModel(tokenLowerCase)) {
70 restoredToken = StringUtil.copyDVowelCapitalization(token, currentToken);
71 } else {
72 currentToken = tokenLowerCase;
73 }
74 }
75 // else if it is not in a sequence
76 else {
77 // Try and restor it with the DoubleVowelModel
78 if (restoreByDoubleVowelModel(tokenLowerCase)) {
79 restoredToken = StringUtil.copyDVowelCapitalization(token, currentToken);
80 }
81 // try and restor it with the MacronModel
82 else if (restoreByMacronModel(tokenLowerCase)) {
83 restoredToken = StringUtil.copyCapitalization(token, currentToken);
84 }
85 // Dose not need to be restored
86 else {
87 currentToken = tokenLowerCase;
88 }
89 }
90 previousToken = currentToken;
91 return restoredToken;
92 }
93
94 // trys to restore by using the MacronModel and return boolean if it has.
95 private boolean restoreByMacronModel(String token) {
96
97 // Test to see if token is in the Distinct Transformation hashset.
98 if (macronModel.isDistinctTransformation(token)) {
99 //if token is in Distinct Transformation hashset make the distinct ransformation then move along the sequence.
100 currentToken = macronModel.getDistinctTransformation(token);
101 selector.incrementMacronTokenSequence();
102 return true;
103 }
104 // If token is in Indistinct Transformation hashset
105 else if (macronModel.isIndistinctTransformation(token)) {
106 final String[] transformations = macronModel.getIndistinctTransformation(token);
107 double maxProbability = Double.MIN_VALUE;
108 String maxToken = token;
109 // loops all transformation and finds the one with the highest probability given the previous token.
110 for (String transformation : transformations) {
111 //double probability = macronModel.getIndistinctProbability(transformation);
112 double probability = 1.0;
113 //if macron model has a probability for this transformation then get its probability.
114 if (macronModel.containsMonogramProbability(transformation, previousToken)) {
115 probability *= macronModel.getMonogramProbability(transformation, previousToken);
116 }
117
118 // if transformation not in macron model then try and find probability in Indistinct probability
119 if (probability == 1.0) {
120 probability = macronModel.getIndistinctProbability(transformation);
121 }
122
123 //If probability is better then any before it update maxProbability and maxToken.
124 if (probability >= maxProbability) {
125 maxProbability = probability;
126 maxToken = transformation;
127 }
128 }
129 currentToken = maxToken;
130 selector.incrementMacronTokenSequence();
131 return true;
132 }
133 return false;
134 }
135
136 private boolean restoreByDoubleVowelModel(String token) {
137 if (doubleVowelModel.isDistinctTransformation(token)) {
138 try {
139 currentToken = doubleVowelModel.getDistinctTransformation(token);
140 selector.incrementDoubleVowelTokenSequence();
141 return true;
142 } catch (Exception e) {
143 System.out.println("double vowel error 1");
144 System.exit(0);
145 }
146 } else if (doubleVowelModel.isIndistinctTransformation(token)) {
147 String[] transformations = doubleVowelModel.getIndistinctTransformation(token);
148 Double maxProbability = Double.MIN_VALUE;
149 String maxToken = token;
150 for (String transformation : transformations) {
151 //double probability = doubleVowelModel.getIndistinctProbability(transformation);
152
153 double probability = 1.0;
154 if (doubleVowelModel.containsMonogramProbability(transformation, previousToken)) {
155 probability *= doubleVowelModel.getMonogramProbability(transformation, previousToken);
156 }
157 if (probability == 1.0) {
158 probability = doubleVowelModel.getIndistinctProbability(transformation);
159 }
160 if (probability >= maxProbability) {
161 maxProbability = probability;
162 maxToken = transformation;
163 }
164 }
165 currentToken = maxToken;
166 selector.incrementDoubleVowelTokenSequence();
167 return true;
168 }
169 return false;
170 }
171
172 private class Selector {
173
174 private int max;
175 private int current;
176
177 public Selector(int max) {
178 this.max = max;
179 current = max;
180 }
181
182 public void incrementMacronTokenSequence() {
183 if (current < max) {
184 current++;
185 }
186 }
187
188 public void incrementDoubleVowelTokenSequence() {
189 if (current > 0) {
190 current--;
191 }
192 }
193
194 public boolean isMacronTokenSequence() {
195 return current > 0;
196 }
197
198 public boolean isDoubleVowelTokenSequence() {
199 return current == 0;
200 }
201 }
202}
Note: See TracBrowser for help on using the repository browser.