/*
 * This class does all the hard work. It looks up the input token in the MonogramModel and returns the restored token.
 */
|
---|
4 | package monogram.restorer;
|
---|
5 |
|
---|
6 | import monogram.model.MonogramFactory;
|
---|
7 | import monogram.model.MonogramModel;
|
---|
8 | import monogram.model.ListModel;
|
---|
9 | import util.StringUtil;
|
---|
10 |
|
---|
11 | /**
|
---|
12 | * @author University of Waikato - Te Whare WÄnanga o Waikato
|
---|
13 | * @version 1.0
|
---|
14 | * @since 2014-11-20
|
---|
15 | */
|
---|
16 | public class MonogramRestorer {
|
---|
17 |
|
---|
18 | private boolean preserveMacrons;
|
---|
19 | private MonogramModel macronModel;
|
---|
20 | private MonogramModel doubleVowelModel;
|
---|
21 | private ListModel blackListModel;
|
---|
22 |
|
---|
23 | private String previousToken;
|
---|
24 | private String currentToken;
|
---|
25 | private Selector selector;
|
---|
26 |
|
---|
27 | public MonogramRestorer(boolean preserveMacrons) {
|
---|
28 | this.preserveMacrons = preserveMacrons;
|
---|
29 | macronModel = MonogramFactory.getMacronModel();
|
---|
30 | doubleVowelModel = MonogramFactory.getDoubleVowelModel();
|
---|
31 | blackListModel = MonogramFactory.getBlackList();
|
---|
32 | previousToken = "";
|
---|
33 | selector = new Selector(3);
|
---|
34 | }
|
---|
35 |
|
---|
36 | public String restore(String token, Boolean markupChangedWords) {
|
---|
37 | // Test to see if it should preserve the macrons already in the input texts. If
|
---|
38 | // not remove all macrons.
|
---|
39 | if (!preserveMacrons && StringUtil.containsAccents(token)) {
|
---|
40 | token = StringUtil.removeAccents(token);
|
---|
41 | }
|
---|
42 |
|
---|
43 | // Make lower case copy of token.
|
---|
44 | final String tokenLowerCase = token.toLowerCase();
|
---|
45 |
|
---|
46 | String restoredToken = token;
|
---|
47 | // If the word is in the black list return the token with no macron.
|
---|
48 | if (blackListModel.contains(token)) {
|
---|
49 | return restoredToken;
|
---|
50 | }
|
---|
51 | // Is selector > 0 it is on a sequence
|
---|
52 | if (selector.isMacronTokenSequence()) {
|
---|
53 | if (restoreByMacronModel(tokenLowerCase)) {
|
---|
54 | // Restore capitalization
|
---|
55 | restoredToken = StringUtil.copyCapitalization(token, currentToken);
|
---|
56 | // If restoredToken is not the same as token then macron/s have been add so add
|
---|
57 | // <mark> html. This is the yellow background around the words on the front end.
|
---|
58 | if (!restoredToken.equals(token)) {
|
---|
59 | // only add <mark> html tag if it directinput and not being output to a file.
|
---|
60 | if (markupChangedWords) {
|
---|
61 | restoredToken = "<mark>" + restoredToken + "</mark> ";
|
---|
62 | }
|
---|
63 | }
|
---|
64 |
|
---|
65 | // if the tonken can not be restored by the MacronModel try and restor it with
|
---|
66 | // the DoubleVowelModel.
|
---|
67 | } else if (restoreByDoubleVowelModel(tokenLowerCase)) {
|
---|
68 | restoredToken = StringUtil.copyDVowelCapitalization(token, currentToken);
|
---|
69 | } else {
|
---|
70 | currentToken = tokenLowerCase;
|
---|
71 | }
|
---|
72 | }
|
---|
73 | // else if it is not in a sequence
|
---|
74 | else {
|
---|
75 | // Try and restor it with the DoubleVowelModel
|
---|
76 | if (restoreByDoubleVowelModel(tokenLowerCase)) {
|
---|
77 | restoredToken = StringUtil.copyDVowelCapitalization(token, currentToken);
|
---|
78 | }
|
---|
79 | // try and restor it with the MacronModel
|
---|
80 | else if (restoreByMacronModel(tokenLowerCase)) {
|
---|
81 | restoredToken = StringUtil.copyCapitalization(token, currentToken);
|
---|
82 | }
|
---|
83 | // Dose not need to be restored
|
---|
84 | else {
|
---|
85 | currentToken = tokenLowerCase;
|
---|
86 | }
|
---|
87 | }
|
---|
88 | previousToken = currentToken;
|
---|
89 | return restoredToken;
|
---|
90 | }
|
---|
91 |
|
---|
92 | // trys to restore by using the MacronModel and return boolean if it has.
|
---|
93 | private boolean restoreByMacronModel(String token) {
|
---|
94 |
|
---|
95 | // Test to see if token is in the Distinct Transformation hashset.
|
---|
96 | if (macronModel.isDistinctTransformation(token)) {
|
---|
97 | // if token is in Distinct Transformation hashset make the distinct
|
---|
98 | // ransformation then move along the sequence.
|
---|
99 | currentToken = macronModel.getDistinctTransformation(token);
|
---|
100 | selector.incrementMacronTokenSequence();
|
---|
101 | return true;
|
---|
102 | }
|
---|
103 | // If token is in Indistinct Transformation hashset
|
---|
104 | else if (macronModel.isIndistinctTransformation(token)) {
|
---|
105 | final String[] transformations = macronModel.getIndistinctTransformation(token);
|
---|
106 | double maxProbability = Double.MIN_VALUE;
|
---|
107 | String maxToken = token;
|
---|
108 | // loops all transformation and finds the one with the highest probability given
|
---|
109 | // the previous token.
|
---|
110 | for (String transformation : transformations) {
|
---|
111 | // double probability = macronModel.getIndistinctProbability(transformation);
|
---|
112 | double probability = 1.0;
|
---|
113 | // if macron model has a probability for this transformation then get its
|
---|
114 | // probability.
|
---|
115 | if (macronModel.containsMonogramProbability(transformation, previousToken)) {
|
---|
116 | probability *= macronModel.getMonogramProbability(transformation, previousToken);
|
---|
117 | }
|
---|
118 |
|
---|
119 | // if transformation not in macron model then try and find probability in
|
---|
120 | // Indistinct probability
|
---|
121 | if (probability == 1.0) {
|
---|
122 | probability = macronModel.getIndistinctProbability(transformation);
|
---|
123 | }
|
---|
124 |
|
---|
125 | // If probability is better then any before it update maxProbability and
|
---|
126 | // maxToken.
|
---|
127 | if (probability >= maxProbability) {
|
---|
128 | maxProbability = probability;
|
---|
129 | maxToken = transformation;
|
---|
130 | }
|
---|
131 | }
|
---|
132 | currentToken = maxToken;
|
---|
133 | selector.incrementMacronTokenSequence();
|
---|
134 | return true;
|
---|
135 | }
|
---|
136 | return false;
|
---|
137 | }
|
---|
138 |
|
---|
139 | private boolean restoreByDoubleVowelModel(String token) {
|
---|
140 | if (doubleVowelModel.isDistinctTransformation(token)) {
|
---|
141 | try {
|
---|
142 | currentToken = doubleVowelModel.getDistinctTransformation(token);
|
---|
143 | selector.incrementDoubleVowelTokenSequence();
|
---|
144 | return true;
|
---|
145 | } catch (Exception e) {
|
---|
146 | System.err.println("double vowel error 1");
|
---|
147 | System.exit(0);
|
---|
148 | }
|
---|
149 | } else if (doubleVowelModel.isIndistinctTransformation(token)) {
|
---|
150 | String[] transformations = doubleVowelModel.getIndistinctTransformation(token);
|
---|
151 | Double maxProbability = Double.MIN_VALUE;
|
---|
152 | String maxToken = token;
|
---|
153 | for (String transformation : transformations) {
|
---|
154 | // double probability =
|
---|
155 | // doubleVowelModel.getIndistinctProbability(transformation);
|
---|
156 |
|
---|
157 | double probability = 1.0;
|
---|
158 | if (doubleVowelModel.containsMonogramProbability(transformation, previousToken)) {
|
---|
159 | probability *= doubleVowelModel.getMonogramProbability(transformation, previousToken);
|
---|
160 | }
|
---|
161 | if (probability == 1.0) {
|
---|
162 | probability = doubleVowelModel.getIndistinctProbability(transformation);
|
---|
163 | }
|
---|
164 | if (probability >= maxProbability) {
|
---|
165 | maxProbability = probability;
|
---|
166 | maxToken = transformation;
|
---|
167 | }
|
---|
168 | }
|
---|
169 | currentToken = maxToken;
|
---|
170 | selector.incrementDoubleVowelTokenSequence();
|
---|
171 | return true;
|
---|
172 | }
|
---|
173 | return false;
|
---|
174 | }
|
---|
175 |
|
---|
176 | private class Selector {
|
---|
177 |
|
---|
178 | private int max;
|
---|
179 | private int current;
|
---|
180 |
|
---|
181 | public Selector(int max) {
|
---|
182 | this.max = max;
|
---|
183 | current = max;
|
---|
184 | }
|
---|
185 |
|
---|
186 | public void incrementMacronTokenSequence() {
|
---|
187 | if (current < max) {
|
---|
188 | current++;
|
---|
189 | }
|
---|
190 | }
|
---|
191 |
|
---|
192 | public void incrementDoubleVowelTokenSequence() {
|
---|
193 | if (current > 0) {
|
---|
194 | current--;
|
---|
195 | }
|
---|
196 | }
|
---|
197 |
|
---|
198 | public boolean isMacronTokenSequence() {
|
---|
199 | return current > 0;
|
---|
200 | }
|
---|
201 |
|
---|
202 | // public boolean isDoubleVowelTokenSequence() {
|
---|
203 | // return current == 0;
|
---|
204 | // }
|
---|
205 | }
|
---|
206 | }
|
---|