source: trunk/gsdl/packages/kea/kea-3.0/KEAPhraseFilter.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.3 KB
Line 
1/*
2 * KEAPhraseFilter.java
3 * Copyright (C) 2001 Eibe Frank
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20import java.util.*;
21
22import weka.core.*;
23import weka.filters.*;
24
25/**
26 * This filter splits the text in selected string
27 * attributes into phrases. The resulting
28 * string attributes contain these phrases
29 * separated by '\n' characters.
30 *
31 * Phrases are identified according to the
32 * following definitions:
33 *
34 * A phrase is a sequence of words interrupted
35 * only by sequences of whitespace characters,
36 * where each sequence of whitespace characters
37 * contains at most one '\n'.
38 *
39 * A word is a sequence of letters or digits
40 * that contains at least one letter, with
41 * the following exceptions:
42 *
43 * a) '.', '@', '_', '&', '/', '-' are allowed
44 * if surrounded by letters or digits,
45 *
46 * b) '\'' is allowed if preceded by a letter
47 * or digit,
48 *
49 * c) '-', '/' are also allowed if succeeded by
50 * whitespace characters followed by another
51 * word. In that case the whitespace characters
52 * will be deleted.
53 *
54 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
55 * @version 1.0
56 */
57public class KEAPhraseFilter extends Filter implements OptionHandler {
58
59 /** Stores which columns to select as a funky range */
60 protected Range m_SelectCols = new Range();
61
62 /** Determines whether internal periods are allowed */
63 protected boolean m_DisallowInternalPeriods = false;
64
65 /**
66 * Returns a string describing this filter
67 *
68 * @return a description of the filter suitable for
69 * displaying in the explorer/experimenter gui
70 */
71 public String globalInfo() {
72 return "This filter splits the text contained " +
73 "by the selected string attributes into phrases.";
74 }
75
76 /**
77 * Returns an enumeration describing the available options
78 *
79 * @return an enumeration of all the available options
80 */
81 public Enumeration listOptions() {
82
83 Vector newVector = new Vector(3);
84
85 newVector.addElement(new Option(
86 "\tSpecify list of attributes to process. First and last are valid\n"
87 +"\tindexes. (default none)",
88 "R", 1, "-R <index1,index2-index4,...>"));
89 newVector.addElement(new Option(
90 "\tInvert matching sense",
91 "V", 0, "-V"));
92 newVector.addElement(new Option(
93 "\tDisallow internal periods",
94 "P", 0, "-P"));
95
96 return newVector.elements();
97 }
98
99 /**
100 * Parses a given list of options controlling the behaviour of this object.
101 * Valid options are:<p>
102 *
103 * -R index1,index2-index4,...<br>
104 * Specify list of attributes to process. First and last are valid indexes.
105 * (default none)<p>
106 *
107 * -V<br>
108 * Invert matching sense <p>
109 *
110 * -P<br>
111 * Disallow internal periods <p>
112 *
113 * @param options the list of options as an array of strings
114 * @exception Exception if an option is not supported
115 */
116 public void setOptions(String[] options) throws Exception {
117
118 String list = Utils.getOption('R', options);
119 if (list.length() != 0) {
120 setAttributeIndices(list);
121 }
122 setInvertSelection(Utils.getFlag('V', options));
123
124 setDisallowInternalPeriods(Utils.getFlag('P', options));
125
126 if (getInputFormat() != null) {
127 setInputFormat(getInputFormat());
128 }
129 }
130
131 /**
132 * Gets the current settings of the filter.
133 *
134 * @return an array of strings suitable for passing to setOptions
135 */
136 public String [] getOptions() {
137
138 String [] options = new String [4];
139 int current = 0;
140
141 if (getInvertSelection()) {
142 options[current++] = "-V";
143 }
144 if (getDisallowInternalPeriods()) {
145 options[current++] = "-P";
146 }
147 if (!getAttributeIndices().equals("")) {
148 options[current++] = "-R"; options[current++] = getAttributeIndices();
149 }
150
151 while (current < options.length) {
152 options[current++] = "";
153 }
154 return options;
155 }
156
157 /**
158 * Sets the format of the input instances.
159 *
160 * @param instanceInfo an Instances object containing the input
161 * instance structure (any instances contained in the object are
162 * ignored - only the structure is required).
163 * @return true if the outputFormat may be collected immediately
164 */
165 public boolean setInputFormat(Instances instanceInfo) throws Exception {
166
167 super.setInputFormat(instanceInfo);
168 setOutputFormat(instanceInfo);
169 m_SelectCols.setUpper(instanceInfo.numAttributes() - 1);
170
171 return true;
172 }
173
174 /**
175 * Input an instance for filtering. Ordinarily the instance is processed
176 * and made available for output immediately. Some filters require all
177 * instances be read before producing output.
178 *
179 * @param instance the input instance
180 * @return true if the filtered instance may now be
181 * collected with output().
182 * @exception Exception if the input instance was not of the correct
183 * format or if there was a problem with the filtering.
184 */
185 public boolean input(Instance instance) throws Exception {
186
187 if (getInputFormat() == null) {
188 throw new Exception("No input instance format defined");
189 }
190 if (m_NewBatch) {
191 resetQueue();
192 m_NewBatch = false;
193 }
194 convertInstance(instance);
195 return true;
196 }
197
198 /**
199 * Signify that this batch of input to the filter is finished. If
200 * the filter requires all instances prior to filtering, output()
201 * may now be called to retrieve the filtered instances. Any
202 * subsequent instances filtered should be filtered based on setting
203 * obtained from the first batch (unless the inputFormat has been
204 * re-assigned or new options have been set). This default
205 * implementation assumes all instance processing occurs during
206 * inputFormat() and input().
207 *
208 * @return true if there are instances pending output
209 * @exception NullPointerException if no input structure has been defined,
210 * @exception Exception if there was a problem finishing the batch.
211 */
212 public boolean batchFinished() throws Exception {
213
214 if (getInputFormat() == null) {
215 throw new NullPointerException("No input instance format defined");
216 }
217 m_NewBatch = true;
218 return (numPendingOutput() != 0);
219 }
220
221 /**
222 * Main method for testing this class.
223 *
224 * @param argv should contain arguments to the filter: use -h for help
225 */
226 public static void main(String [] argv) {
227
228 try {
229 if (Utils.getFlag('b', argv)) {
230 Filter.batchFilterFile(new KEAPhraseFilter(), argv);
231 } else {
232 Filter.filterFile(new KEAPhraseFilter(), argv);
233 }
234 } catch (Exception ex) {
235 System.out.println(ex.getMessage());
236 }
237 }
238
239 /**
240 * Converts an instance by removing all non-alphanumeric characters
241 * from its string attribute values.
242 */
243 private void convertInstance(Instance instance) throws Exception {
244
245 double[] instVals = new double[instance.numAttributes()];
246
247 for (int i = 0; i < instance.numAttributes(); i++) {
248 if (!instance.attribute(i).isString() ||
249 instance.isMissing(i)) {
250 instVals[i] = instance.value(i);
251 } else {
252 if (!m_SelectCols.isInRange(i)) {
253 int index = getOutputFormat().attribute(i).
254 addStringValue(instance.stringValue(i));
255 instVals[i] = (double)index;
256 continue;
257 }
258 String str = instance.stringValue(i);
259 StringBuffer resultStr = new StringBuffer();
260 int j = 0;
261 boolean phraseStart = true;
262 boolean seenNewLine = false;
263 boolean haveSeenHyphen = false;
264 boolean haveSeenSlash = false;
265 while (j < str.length()) {
266 boolean isWord = false;
267 boolean potNumber = false;
268 int startj = j;
269 while (j < str.length()) {
270 char ch = str.charAt(j);
271 if (Character.isLetterOrDigit(ch)) {
272 potNumber = true;
273 if (Character.isLetter(ch)) {
274 isWord = true;
275 }
276 j++;
277 } else if ((!m_DisallowInternalPeriods && (ch == '.')) ||
278 (ch == '@') ||
279 (ch == '_') ||
280 (ch == '&') ||
281 (ch == '/') ||
282 (ch == '-')) {
283 if ((j > 0) && (j + 1 < str.length()) &&
284 Character.isLetterOrDigit(str.charAt(j - 1)) &&
285 Character.isLetterOrDigit(str.charAt(j + 1))) {
286 j++;
287 } else {
288 break;
289 }
290 } else if (ch == '\'') {
291 if ((j > 0) &&
292 Character.isLetterOrDigit(str.charAt(j - 1))) {
293 j++;
294 } else {
295 break;
296 }
297 } else {
298 break;
299 }
300 }
301 if (isWord == true) {
302 if (!phraseStart) {
303 if (haveSeenHyphen) {
304 resultStr.append('-');
305 } else if (haveSeenSlash) {
306 resultStr.append('/');
307 } else {
308 resultStr.append(' ');
309 }
310 }
311 resultStr.append(str.substring(startj, j));
312 if (j == str.length()) {
313 break;
314 }
315 phraseStart = false;
316 seenNewLine = false;
317 haveSeenHyphen = false;
318 haveSeenSlash = false;
319 if (Character.isWhitespace(str.charAt(j))) {
320 if (str.charAt(j) == '\n') {
321 seenNewLine = true;
322 }
323 } else if (str.charAt(j) == '-') {
324 haveSeenHyphen = true;
325 } else if (str.charAt(j) == '/') {
326 haveSeenSlash = true;
327 } else {
328 phraseStart = true;
329 resultStr.append('\n');
330 }
331 j++;
332 } else if (j == str.length()) {
333 break;
334 } else if (str.charAt(j) == '\n') {
335 if (seenNewLine) {
336 if (phraseStart == false) {
337 resultStr.append('\n');
338 phraseStart = true;
339 }
340 } else if (potNumber) {
341 if (phraseStart == false) {
342 phraseStart = true;
343 resultStr.append('\n');
344 }
345 }
346 seenNewLine = true;
347 j++;
348 } else if (Character.isWhitespace(str.charAt(j))) {
349 if (potNumber) {
350 if (phraseStart == false) {
351 phraseStart = true;
352 resultStr.append('\n');
353 }
354 }
355 j++;
356 } else {
357 if (phraseStart == false) {
358 resultStr.append('\n');
359 phraseStart = true;
360 }
361 j++;
362 }
363 }
364 int index = getOutputFormat().attribute(i).
365 addStringValue(resultStr.toString());
366 instVals[i] = (double)index;
367 }
368 }
369 Instance inst = new Instance(instance.weight(), instVals);
370 inst.setDataset(getOutputFormat());
371 push(inst);
372 }
373
374 /**
375 * Returns the tip text for this property
376 *
377 * @return tip text for this property suitable for
378 * displaying in the explorer/experimenter gui
379 */
380 public String invertSelectionTipText() {
381
382 return "If set to false, the specified attributes will be processed;"
383 + " If set to true, specified attributes won't be processed.";
384 }
385
386 /**
387 * Get whether the supplied columns are to be processed
388 *
389 * @return true if the supplied columns won't be processed
390 */
391 public boolean getInvertSelection() {
392
393 return m_SelectCols.getInvert();
394 }
395
396 /**
397 * Set whether selected columns should be processed. If true the
398 * selected columns won't be processed.
399 *
400 * @param invert the new invert setting
401 */
402 public void setInvertSelection(boolean invert) {
403
404 m_SelectCols.setInvert(invert);
405 }
406
407 /**
408 * Returns the tip text for this property
409 *
410 * @return tip text for this property suitable for
411 * displaying in the explorer/experimenter gui
412 */
413 public String disallowInternalPeriodsTipText() {
414
415 return "If set to false, internal periods are allowed.";
416 }
417
418 /**
419 * Get whether the supplied columns are to be processed
420 *
421 * @return true if the supplied columns won't be processed
422 */
423 public boolean getDisallowInternalPeriods() {
424
425 return m_DisallowInternalPeriods;
426 }
427
428 /**
429 * Set whether selected columns should be processed. If true the
430 * selected columns won't be processed.
431 *
432 * @param invert the new invert setting
433 */
434 public void setDisallowInternalPeriods(boolean disallow) {
435
436 m_DisallowInternalPeriods = disallow;
437 }
438
439 /**
440 * Returns the tip text for this property
441 *
442 * @return tip text for this property suitable for
443 * displaying in the explorer/experimenter gui
444 */
445 public String attributeIndicesTipText() {
446
447 return "Specify range of attributes to act on."
448 + " This is a comma separated list of attribute indices, with"
449 + " \"first\" and \"last\" valid values. Specify an inclusive"
450 + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
451 }
452
453 /**
454 * Get the current range selection.
455 *
456 * @return a string containing a comma separated list of ranges
457 */
458 public String getAttributeIndices() {
459
460 return m_SelectCols.getRanges();
461 }
462
463 /**
464 * Set which attributes are to be processed
465 *
466 * @param rangeList a string representing the list of attributes. Since
467 * the string will typically come from a user, attributes are indexed from
468 * 1. <br>
469 * eg: first-3,5,6-last
470 */
471 public void setAttributeIndices(String rangeList) {
472
473 m_SelectCols.setRanges(rangeList);
474 }
475
476 /**
477 * Set which attributes are to be processed
478 *
479 * @param attributes an array containing indexes of attributes to select.
480 * Since the array will typically come from a program, attributes are indexed
481 * from 0.
482 */
483 public void setAttributeIndicesArray(int [] attributes) {
484
485 setAttributeIndices(Range.indicesToRangeList(attributes));
486 }
487}
Note: See TracBrowser for help on using the repository browser.