source: trunk/gsdl/packages/kea/kea-3.0/NumbersFilter.java@ 8815

Last change on this file since 8815 was 8815, checked in by mdewsnip, 19 years ago

Kea 3.0, as downloaded from http://www.nzdl.org/kea but with CSTR_abstracts_test, CSTR_abstracts_train, Chinese_test, and Chinese_train directories removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1/*
2 * NumbersFilter.java
3 * Copyright (C) 2000 Eibe Frank
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20import weka.core.*;
21import weka.filters.*;
22
23import java.util.*;
24
25/**
26 * Removes all numbers from all the string attributes in the given
27 * dataset. Assumes that words are separated by whitespace.
28 *
29 * @author Eibe Frank ([email protected])
30 * @version 1.0
31 */
32public class NumbersFilter extends Filter {
33
34 /**
35 * Returns a string describing this filter
36 *
37 * @return a description of the filter suitable for
38 * displaying in the explorer/experimenter gui
39 */
40 public String globalInfo() {
41 return "Removes all numbers from all the string attributes in " +
42 "the given dataset. Assumes that words are separated by whitespace.";
43 }
44
45 /**
46 * Signify that this batch of input to the filter is finished. If
47 * the filter requires all instances prior to filtering, output()
48 * may now be called to retrieve the filtered instances. Any
49 * subsequent instances filtered should be filtered based on setting
50 * obtained from the first batch (unless the inputFormat has been
51 * re-assigned or new options have been set). This default
52 * implementation assumes all instance processing occurs during
53 * inputFormat() and input().
54 *
55 * @return true if there are instances pending output
56 * @exception NullPointerException if no input structure has been defined,
57 * @exception Exception if there was a problem finishing the batch.
58 */
59 public boolean batchFinished() throws Exception {
60
61 if (getInputFormat() == null) {
62 throw new NullPointerException("No input instance format defined");
63 }
64 m_NewBatch = true;
65 return (numPendingOutput() != 0);
66 }
67
68 /**
69 * Sets the format of the input instances.
70 *
71 * @param instanceInfo an Instances object containing the input
72 * instance structure (any instances contained in the object are
73 * ignored - only the structure is required).
74 * @return true if the outputFormat may be collected immediately
75 */
76 public boolean setInputFormat(Instances instanceInfo) throws Exception {
77
78 super.setInputFormat(instanceInfo);
79 setOutputFormat(instanceInfo);
80 return true;
81 }
82
83 /**
84 * Input an instance for filtering. Ordinarily the instance is processed
85 * and made available for output immediately. Some filters require all
86 * instances be read before producing output.
87 *
88 * @param instance the input instance
89 * @return true if the filtered instance may now be
90 * collected with output().
91 * @exception Exception if the input instance was not of the correct
92 * format or if there was a problem with the filtering.
93 */
94 public boolean input(Instance instance) throws Exception {
95
96 if (getInputFormat() == null) {
97 throw new Exception("No input instance format defined");
98 }
99 if (m_NewBatch) {
100 resetQueue();
101 m_NewBatch = false;
102 }
103 convertInstance(instance);
104 return true;
105 }
106
107 /**
108 * Main method for testing this class.
109 *
110 * @param argv should contain arguments to the filter: use -h for help
111 */
112 public static void main(String [] argv) {
113
114 try {
115 if (Utils.getFlag('b', argv)) {
116 Filter.batchFilterFile(new NumbersFilter(), argv);
117 } else {
118 Filter.filterFile(new NumbersFilter(), argv);
119 }
120 } catch (Exception ex) {
121 System.out.println(ex.getMessage());
122 }
123 }
124
125 /**
126 * Converts an instance. A phrase boundary is inserted where
127 * a number is found.
128 */
129 private void convertInstance(Instance instance) throws Exception {
130
131 double[] instVals = new double[instance.numAttributes()];
132
133 for (int i = 0; i < instance.numAttributes(); i++) {
134 if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
135 instVals[i] = instance.value(i);
136 } else {
137 String str = instance.stringValue(i);
138 StringBuffer resultStr = new StringBuffer();
139 StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
140 while (tok.hasMoreTokens()) {
141 String token = tok.nextToken();
142
143 // Everything that doesn't contain at least
144 // one letter is considered to be a number
145 boolean isNumber = true;
146 for (int j = 0; j < token.length(); j++) {
147 if (Character.isLetter(token.charAt(j))) {
148 isNumber = false;
149 break;
150 }
151 }
152 if (!isNumber) {
153 resultStr.append(token);
154 } else {
155 if (token.equals(" ") || token.equals("\t") ||
156 token.equals("\n")) {
157 resultStr.append(token);
158 } else {
159 resultStr.append(" \n ");
160 }
161 }
162 }
163 int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
164 instVals[i] = (double)index;
165 }
166 }
167 Instance inst = new Instance(instance.weight(), instVals);
168 inst.setDataset(getOutputFormat());
169 push(inst);
170 }
171}
172
173
174
175
176
177
178
179
180
181
182
Note: See TracBrowser for help on using the repository browser.