/*
 * KEP.java
 * Version 1.1
 *
 * Kea -- Automatic Keyphrase Extraction
 * Copyright 1998-1999 by Eibe Frank and Gordon Paynter
 * Contact [email protected] or [email protected]
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * KEP is the machine learning and post-processing part of KEA,
 * the Keyphrase Extraction Algorithm.
 *
 * KEP assumes that the class is the last attribute and
 * that keyphrases not found in the text have a
 * missing class value.
 *
 * @author Eibe Frank ([email protected])
 */

/*
 * Version History
 *
 * 1.0   Witten et al. version
 * 1.1   First distribution
 * 1.1.1 Bug: when more output was requested than there are (filtered)
 *       instances in the arff file, Kea would output garbage.
 *       STDOUT output cleaned up in the training stage.
 */
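
/*
 * Illustrative command-line usage (a sketch only -- file names are
 * placeholders, and the jaws classes are assumed to be on the classpath):
 *
 *   java KEP -t train.arff -m kea.model       build a model and save it
 *   java KEP -m kea.model -T test.arff        load a model and rank the test phrases
 *   java KEP -t train.arff -T test.arff -N 5  train, then output 5 phrases per document
 *
 * The full set of switches is described by listOptions() below.
 */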

import java.io.*;
import java.util.*;
import java.util.zip.*;
import jaws.corePackage.*;
import jaws.evaluationPackage.*;
import jaws.classifierPackage.*;
import jaws.filterPackage.*;

public class KEP implements Serializable {

  // ================
  // Public variables
  // ================

  /**
   * The maximum number of documents in
   * training and test corpus.
   */

  public final static int MAX_NUM_DOCS = 1000;

  /**
   * Index of the column with the document name.
   */

  public final static int DOC_NUM_INDEX = 1;

  /**
   * Number of steps for optimization of
   * probability cutoff.
   */

  public final static int PROB_STEPS = 100;

  /**
   * Minimum probability that a phrase
   * has to achieve.
   */

  public final static double MIN_PROB = 0.00;

  /**
   * Maximum number of bins for equal
   * width discretization.
   */

  public final static int MAX_BINS = 30;

  /**
   * Minimum number of bins for equal
   * width discretization.
   */

  public final static int MIN_BINS = 2;

  /**
   * Number of folds for standard
   * cross-validation.
   */

  public final static int NUM_FOLDS = 5;

  /**
   * Columns to be discretized.
   */

  public final static String DISC_RANGE = "first-last";

  // =================
  // Private variables
  // =================

  /**
   * The classifier for determining
   * keyphrases.
   */

  private DistributionClassifier theClassifier = new NaiveBayes();

  /**
   * The number of phrases to be output.
   */

  private int theNumPhrases = -1;

  /**
   * The minimum probability a phrase has
   * to achieve.
   */

  private double theMinProb = -1;

  /**
   * Rank cutoff.
   */

  private int theRankCutoff;

  /**
   * The maximum rank cutoff.
   */

  private int theMaxRankCutoff = 20;

  /**
   * The minimum rank cutoff.
   */

  private int theMinRankCutoff = 1;

  /**
   * Probability cutoff.
   */

  private double theProbCutoff;

  /**
   * The number of bins for equal width discretization.
   */

  private int theNumBins = 23;

  /**
   * The discretizer built up on the training data.
   */

  private DiscretiseFilter theDisc;

  /**
   * Equal width discretization?
   */

  private boolean theEqual = false;

  /**
   * Columns in arff file to be used for building
   * the classifier.
   */

  private String[] theCols = {"tfidf","first_occurrence","class"};

  /**
   * Column to be used for subphrase selection.
   */

  private String theSubphraseColumnName = "tfidf";

  /**
   * Column for subphrase selection: higher is better.
   */

  private boolean theSubphraseHi = true;

  /**
   * The secondary key for sorting the phrases in
   * the table (primary key: probability).
   */

  private String theSecKeyColumnName = "tfidf";

  /**
   * Column for secondary key: higher is better.
   */

  private boolean theSecKeyHi = true;

  /**
   * Column to be used for selecting batches for the
   * cross-validation.
   */

  private String theCVColumnName = "";

  /**
   * Use training data instead of cross-validation?
   */

  private boolean theUseTrain = true;

  /**
   * Use discretization?
   */

  private boolean theUseDiscretization = true;

  // ==============
  // Public methods
  // ==============

  /**
   * Returns the indices for the attribute names given in theCols.
   * @param dataset the dataset containing the attributes
   * @return a comma-separated list of 1-based attribute indices
   */

  private String indices(Instances dataset) {

    String rangeList = "";
    for (int i = 0; i < theCols.length; i++)
      if (i == 0)
        rangeList = ""+(dataset.attribute(theCols[i]).index() + 1);
      else
        rangeList += ","+(dataset.attribute(theCols[i]).index() + 1);
    return rangeList;
  }
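
  /*
   * Illustrative example: with the default theCols = {"tfidf",
   * "first_occurrence", "class"}, indices() looks up each named attribute
   * and returns its 1-based position, e.g. "3,4,9" if those attributes sit
   * at 0-based positions 2, 3 and 8 (the concrete numbers depend entirely
   * on the arff file). This range string is the format that
   * DeleteFilter.setAttributeIndices() expects below.
   */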

  /**
   * Performs a cross-validation on the given dataset.
   * @return the true and false positives, summed over all folds, for
   * every combination of rank and probability cutoff
   */

  private double[][][] crossValidation(Instances instances, int numPos)
    throws Exception {

    SelectFilter selFilter;
    DiscretiseFilter discFilter;
    DeleteFilter delFilter;
    Instances train, test, trainDisc, testDisc;
    int numFolds, indexI, indexP;
    boolean batchedMode = false;
    double[][][] results = new double[theMaxRankCutoff][PROB_STEPS][2];
    double[][][] bestInsts;
    double[] currentTPandFP;
    int[] range = new int[1];
    int theCVColumn = -1;

    if (theCVColumnName.length() > 0)
      theCVColumn = instances.attribute(theCVColumnName).index();

    // Is there more than one document in the dataset?

    if ((theCVColumn != -1) && (instances.attribute(theCVColumn).numValues() > 1)) {
      batchedMode = true;
      numFolds = instances.attribute(theCVColumn).numValues();
    } else
      numFolds = NUM_FOLDS;

    // Perform the cross-validation

    for (int k = 0; k < numFolds; k++) {

      System.err.println("Processing fold: "+k);

      if (batchedMode) {

        // Batched cross-validation

        selFilter = new SelectFilter();
        selFilter.setInvertSelection(true);
        selFilter.setAttributeIndex(theCVColumn + 1);
        selFilter.inputFormat(instances);

        // Generate training set

        range[0] = k;
        selFilter.setNominalIndicesArr(range);
        train = Filter.useFilter(instances, selFilter);

        // Generate test set

        selFilter.setInvertSelection(false);
        test = Filter.useFilter(instances, selFilter);
      } else {

        // Standard NUM_FOLDS cross-validation

        train = instances.trainCV(NUM_FOLDS, k);
        test = instances.testCV(NUM_FOLDS, k);
      }

      System.err.println("Choosing attributes...");

      // Choose attributes in training and test data

      delFilter = new DeleteFilter();
      delFilter.setInvertSelection(true);
      delFilter.setAttributeIndices(indices(instances));
      delFilter.inputFormat(instances);
      trainDisc = Filter.useFilter(train, delFilter);
      testDisc = Filter.useFilter(test, delFilter);

      System.err.println("Discretizing...");

      // Discretise training and test data

      discFilter = new DiscretiseFilter();
      discFilter.setAttributeIndices(DISC_RANGE);
      if (theEqual) {
        discFilter.setBins(theNumBins);
        discFilter.setClassIndex(-1);
        discFilter.setUseMDL(false);
      } else
        discFilter.setClassIndex(trainDisc.numAttributes());
      discFilter.inputFormat(trainDisc);
      trainDisc = Filter.useFilter(trainDisc, discFilter);
      testDisc = Filter.useFilter(testDisc, discFilter);

      System.err.println("Building classifier...");

      // Build classifier on training data

      theClassifier.buildClassifier(trainDisc);

      // Find the most probable keyphrases in the test data

      System.err.println("Looking for the most probable keyphrases...");

      bestInsts = new double[MAX_NUM_DOCS][theMaxRankCutoff][2];
      mostProbablePhrases(test, testDisc, bestInsts);

      // Compute F-measure for all probability and rank
      // cutoffs and add to sum.

      indexI = 0;
      for (int i = theMinRankCutoff; i <= theMaxRankCutoff; i++) {
        indexP = 0;
        for (double p = 1 - (1.0/PROB_STEPS); p >= MIN_PROB;
             p = p - (1.0/PROB_STEPS)) {
          currentTPandFP = TPandFP(test, bestInsts, i, p, false);
          results[indexI][indexP][0] += currentTPandFP[0];
          results[indexI][indexP][1] += currentTPandFP[1];
          indexP++;
        }
        indexI++;
      }
    }

    return results;
  }
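
  /*
   * Layout of the array returned by crossValidation():
   * results[indexI][indexP][0] holds the true positives and
   * results[indexI][indexP][1] the false positives, summed over all folds,
   * where indexI counts rank cutoffs starting at theMinRankCutoff and
   * indexP walks the probability grid downwards from 1 - 1/PROB_STEPS in
   * steps of 1/PROB_STEPS.
   */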

  /**
   * Optimizes the F-measure by choosing the discretization and the
   * rank and probability cutoffs.
   */

  private double optimizeFMeasure(Instances instances, int numPos)
    throws Exception {

    double[][][][] results;
    double currentFMeasure, maxFMeasure = 0;
    int temp_MAX_BINS = MIN_BINS;

    if (theEqual)
      temp_MAX_BINS = MAX_BINS;

    // Do cross-validation for different numbers of bins.

    results =
      new double[temp_MAX_BINS - MIN_BINS + 1][theMaxRankCutoff][PROB_STEPS][2];
    for (int b = MIN_BINS; b <= temp_MAX_BINS; b++) {
      if (theEqual) {
        System.err.println("Computing Estimate for "+b+" bins");
        theNumBins = b;
      }
      results[b - MIN_BINS] = crossValidation(instances, numPos);
    }

    // Choose best parameter settings

    for (int b = MIN_BINS; b <= temp_MAX_BINS; b++) {

      // Find best probability and rank cutoffs

      int indexI = 0;
      for (int i = theMinRankCutoff; i <= theMaxRankCutoff; i++) {
        int indexP = 0;
        for (double p = 1-(1.0/PROB_STEPS); p >= MIN_PROB; p = p-(1.0/PROB_STEPS)) {
          currentFMeasure =
            fMeasure(results[b - MIN_BINS][indexI][indexP][0],
                     results[b - MIN_BINS][indexI][indexP][1],
                     numPos - results[b - MIN_BINS][indexI][indexP][0]);
          if (Utils.grOrEq(currentFMeasure, maxFMeasure)) {
            theRankCutoff = i;
            theProbCutoff = p;
            theNumBins = b;
            maxFMeasure = currentFMeasure;
          }
          indexP++;
        }
        indexI++;
      }
    }

    return maxFMeasure;
  }

  /**
   * Computes true positives and false positives for the dataset.
   */

  private double[] TPandFP(Instances dataset, double[][][] bestInsts,
                           int rankCutoff, double probCutoff, boolean output)
    throws Exception {

    double[] TPandFP = new double[2];

    // For all documents

    for (int i = 0; i < dataset.attribute(DOC_NUM_INDEX).numValues(); i++) {

      if (output)
        System.out.println("Current document: "+
                           dataset.attribute(DOC_NUM_INDEX).value(i));

      // Up to the current rank cutoff

      for (int j = 0; j < rankCutoff; j++) {

        // Don't output trash (instances with probability 0)
        if (bestInsts[i][j][0] == 0) {
          break;
        }

        // Find the true positives and false positives

        if (Utils.sm(bestInsts[i][j][0], probCutoff))
          break;
        if (dataset.instance((int)bestInsts[i][j][1]).classValue() == 0) {
          TPandFP[0]++;
          if (output)
            System.out.println("Hit: "+dataset.instance((int)bestInsts[i][j][1])+
                               " "+bestInsts[i][j][0]);
        } else {
          TPandFP[1]++;
          if (output)
            System.out.println("Miss: "+dataset.instance((int)bestInsts[i][j][1])+
                               " "+bestInsts[i][j][0]);
        }
      }
    }

    return TPandFP;
  }

  /**
   * Computes and outputs the precision per document for rank cutoffs 1 to
   * theMaxRankCutoff. Also outputs the average precision and the half-width
   * of a confidence interval for each rank cutoff.
   */

  private void precisions(Instances dataset, double[][][] bestInsts)
    throws Exception {

    double[] precisions;
    int i, tp, fp;
    double standardError, deltaConfidence;

    // For all rank cutoffs

    for (int r = 1; r <= theMaxRankCutoff; r++) {

      // For all documents

      precisions = new double[dataset.attribute(DOC_NUM_INDEX).numValues()];
      for (i = 0; i < dataset.attribute(DOC_NUM_INDEX).numValues(); i++) {

        // Up to the current rank cutoff

        tp = fp = 0;
        for (int j = 0; j < r; j++) {

          // Find the true positives and false positives

          if (dataset.instance((int)bestInsts[i][j][1]).classValue() == 0)
            tp++;
          else
            fp++;
        }
        precisions[i] = ((double) tp) / ((double) tp + fp);
      }
      System.err.print(r);
      System.err.print("\t"+Utils.
                       doubleToString(Utils.sum(precisions) / i, 6, 4));
      standardError = Math.sqrt(Utils.variance(precisions) / i);
      deltaConfidence = Statistics.studentTConfidenceInterval(i - 1,
                                                              0.05, standardError);
      System.err.println("\t"+Utils.doubleToString(deltaConfidence, 6, 4)+" &&&");
    }
  }
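
  /*
   * The half-width printed in the last column above is a Student-t
   * confidence interval, presumably t(0.05, n - 1) * sqrt(variance / n)
   * with n the number of documents (this assumes that
   * Statistics.studentTConfidenceInterval(df, alpha, se) returns t * se).
   * Each output line therefore reads: rank, mean precision, +/- delta.
   */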

  /**
   * Computes F-Measure.
   */

  private double fMeasure(double tp, double fp, double fn) {

    return (2 * tp) / ((2 * tp) + fp + fn);
  }
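
  /*
   * This is the usual F1 measure: with precision P = tp / (tp + fp) and
   * recall R = tp / (tp + fn),
   *
   *   F = 2 * P * R / (P + R) = 2 * tp / (2 * tp + fp + fn).
   *
   * For example, tp = 5, fp = 10, fn = 10 gives F = 10 / 30 = 0.33.
   */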

  /**
   * Computes the F-Measure for the given dataset.
   */

  private double fMeasure(Instances dataset, double[][][] bestInsts,
                          int rankCutoff, double probCutoff, int numPos,
                          boolean output)
    throws Exception {

    double[] TPandFP = TPandFP(dataset, bestInsts, rankCutoff, probCutoff, output);
    double fn = (double)numPos - TPandFP[0];
    double precision, recall;

    if (output) {
      System.out.println("tp: "+TPandFP[0]+"\tfp: "+TPandFP[1]+"\tfn: "+fn);
      precision = TPandFP[0]/(TPandFP[0]+TPandFP[1]);
      recall = TPandFP[0]/(TPandFP[0]+fn);
      System.out.println("Precision: "+precision);
      System.out.println("Recall: "+recall);
      System.out.println("F-measure: "+((2*precision*recall)/(precision+recall)));
    }

    return fMeasure(TPandFP[0], TPandFP[1], fn);
  }

  /**
   * Computes the best rank and probability cutoffs by maximizing the
   * F-measure on the given dataset.
   */

  private void bestCutoffs(Instances dataset, double[][][] bestInsts,
                           int numPos) throws Exception {

    double maxFMeasure = 0, currentFMeasure;

    theRankCutoff = 0;
    theProbCutoff = 1;
    for (int i = theMinRankCutoff; i <= theMaxRankCutoff; i++) {
      for (double p = 1-(1.0/PROB_STEPS); p >= MIN_PROB; p = p-(1.0/PROB_STEPS)) {
        currentFMeasure = fMeasure(dataset, bestInsts, i, p, numPos, false);
        if (Utils.grOrEq(currentFMeasure, maxFMeasure)) {
          theRankCutoff = i;
          theProbCutoff = p;
          maxFMeasure = currentFMeasure;
        }
        // System.out.println("Current rank cutoff: "+i+
        //                    "\tCurrent prob cutoff: "+p+
        //                    "\tF-Measure: "+currentFMeasure);
      }
    }
    System.err.println("Best rank cutoff: "+theRankCutoff+
                       "\tBest prob cutoff: "+theProbCutoff+
                       "\tF-Measure: "+maxFMeasure);
  }
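
  /*
   * bestCutoffs() is a plain grid search: it evaluates every rank cutoff
   * from theMinRankCutoff to theMaxRankCutoff combined with every
   * probability cutoff from 1 - 1/PROB_STEPS down to MIN_PROB in steps of
   * 1/PROB_STEPS, and keeps the pair with the maximum F-measure (ties are
   * broken by Utils.grOrEq, so later pairs win).
   */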

  /**
   * Counts the positive examples in the dataset. Author-defined keyphrases
   * that were not found in the text (missing class value) count as positive.
   * @return the number of positive examples
   */

  private int findMissedPositives(Instances data) throws Exception {

    int numPos = 0, missing = 0;

    System.err.println("Looking for missed keyphrases in the dataset...");

    for (int i = 0; i < data.numInstances(); i++) {
      if (data.instance(i).classIsMissing()) {
        missing++;
        numPos++;
      } else {
        if (data.instance(i).classValue() == 0)
          numPos++;
      }
    }

    System.err.println("Found " + (int)data.attribute(DOC_NUM_INDEX).numValues() +
                       " documents in the dataset.");
    System.err.println("Total number of examples missing: "+missing);
    System.err.println("Total number of positive examples: "+numPos);

    return numPos;
  }

  /**
   * Gets all the subphrases of a phrase (the phrase itself is excluded).
   */

  public String[] getSubphrases(String phrase) {

    StringTokenizer tokenizer = new StringTokenizer(phrase);
    String[] words = new String[tokenizer.countTokens()], subphrases;
    int i = 0, numSubphrases = 0, s = 0;

    // Get all words in the phrase

    while (tokenizer.hasMoreElements()) {
      words[i] = tokenizer.nextToken();
      i++;
    }

    // How many subphrases?

    for (i = 0; i < words.length - 1; i++)
      numSubphrases += words.length - i;

    // Array for storing the subphrases.

    subphrases = new String[numSubphrases];

    // Compute subphrases.

    for (i = 0; i < words.length; i++) {

      // All subphrases at position i.

      for (int j = 0; j < words.length - i; j++) {

        // The subphrase of length j + 1, apart
        // from the phrase itself.

        if ((i != 0) || (j != (words.length - 1))) {
          if (j == 0)
            subphrases[s] = words[i];
          else
            subphrases[s] = subphrases[s - 1]+' '+words[i + j];
          s++;
        }
      }
    }

    return subphrases;
  }
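
  /*
   * Illustrative example: getSubphrases("neural network model") returns
   * {"neural", "neural network", "network", "network model", "model"} --
   * every contiguous subsequence of the words except the full phrase
   * itself.
   */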

  /**
   * Gets the stemmed phrase of an instance.
   */

  public String getPhrase(Instance instance) throws Exception {

    String string = instance.attribute(0).value((int)instance.value(0));

    return string.substring(1, string.indexOf('(') - 1);
  }
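
  /*
   * This relies on an assumption about how the first attribute is encoded
   * by the KEA preprocessing stage: values appear to have the form
   * "'stemmed phrase (original form)'", so dropping the leading quote and
   * everything from the space before '(' onwards yields the stemmed
   * phrase, e.g. "'neural network (Neural Networks)'" -> "neural network".
   */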

  /**
   * Find the theMaxRankCutoff most probable phrases for each document.
   * @param originalData the undiscretized set of instances
   * @param data the set of instances
   * @param bestInsts an array double[MAX_NUM_DOCS][theMaxRankCutoff][2]:
   * double[i][j][0] contains the probability, and
   * double[i][j][1] the instance's index.
   * @exception Exception if an instance can't be classified
   */

  public void mostProbablePhrases(Instances originalData,
                                  Instances data, double bestInsts[][][])
    throws Exception {

    double[] probAndIndex, probAndIndexSubphrase;
    int numDoc;
    Hashtable[] hashtables = new Hashtable[originalData.attribute(DOC_NUM_INDEX).
                                           numValues()];
    String[] subphrases;
    String phrase;
    int theSubphraseColumn = -1, theSecKeyColumn = -1;

    if (theSubphraseColumnName.length() > 0)
      theSubphraseColumn = originalData.attribute(theSubphraseColumnName).index();
    if (theSecKeyColumnName.length() > 0)
      theSecKeyColumn = originalData.attribute(theSecKeyColumnName).index();

    // Make a new hashtable for each document that stores the phrases.

    for (numDoc = 0; numDoc < originalData.attribute(DOC_NUM_INDEX).numValues();
         numDoc++)
      hashtables[numDoc] =
        new Hashtable(originalData.numInstances()/
                      originalData.attribute(DOC_NUM_INDEX).numValues());

    // Go through all the instances and keep the best phrases (i.e., their
    // indices and probabilities) in hash tables. Delete all subphrases that
    // score worse.

    for (int i = 0; i < data.numInstances(); i++) {

      // Do nothing if the class is missing.

      if (!data.instance(i).classIsMissing()) {

        // Store index and probability in an array.

        probAndIndex = new double[2];
        probAndIndex[0] = theClassifier.
          distributionForInstance(data.instance(i))[0];
        probAndIndex[1] = (double)i;

        // Shortcut -- getting rid of phrases with very small probability.

        if (Utils.sm(probAndIndex[0], MIN_PROB))
          continue;

        // Which document does the current phrase belong to?

        numDoc = (int)originalData.instance(i).value(DOC_NUM_INDEX);

        // Get the actual phrase.

        phrase = getPhrase(originalData.instance(i));

        // Put the phrase into the hashtable.

        hashtables[numDoc].put(phrase, probAndIndex);

        // Do we want to get rid of subphrases?

        if (theSubphraseColumn != -2) {

          // Get all subphrases.

          subphrases = getSubphrases(phrase);

          // Delete all subphrases with lower or equal probability from the
          // hash table.

          for (int j = 0; j < subphrases.length; j++) {
            probAndIndexSubphrase =
              (double[]) hashtables[numDoc].get(subphrases[j]);
            if ((probAndIndexSubphrase != null) &&
                Utils.smOrEq(probAndIndexSubphrase[0], probAndIndex[0]))
              if ((theSubphraseColumn == -1) ||
                  ((theSubphraseHi &&
                    Utils.smOrEq(originalData.
                                 instance((int)probAndIndexSubphrase[1]).
                                 value(theSubphraseColumn),
                                 originalData.instance((int)probAndIndex[1]).
                                 value(theSubphraseColumn))) ||
                   ((!theSubphraseHi) &&
                    Utils.grOrEq(originalData.
                                 instance((int)probAndIndexSubphrase[1]).
                                 value(theSubphraseColumn),
                                 originalData.instance((int)probAndIndex[1]).
                                 value(theSubphraseColumn)))))
                hashtables[numDoc].remove(subphrases[j]);
          }
        }
      }
    }

    // Find the theMaxRankCutoff most probable phrases for each document.

    for (numDoc = 0; numDoc < originalData.attribute(DOC_NUM_INDEX).numValues();
         numDoc++) {
      Enumeration phraseEnum = hashtables[numDoc].elements();
      while (phraseEnum.hasMoreElements()) {
        probAndIndex = (double[]) phraseEnum.nextElement();
        for (int j = 0; j < theMaxRankCutoff; j++)
          if (Utils.gr(probAndIndex[0], bestInsts[numDoc][j][0]) ||
              ((theSecKeyColumn != -1) &&
               Utils.eq(probAndIndex[0], bestInsts[numDoc][j][0]) &&
               ((theSecKeyHi &&
                 Utils.gr(originalData.instance((int)probAndIndex[1]).
                          value(theSecKeyColumn),
                          originalData.instance((int)bestInsts[numDoc][j][1]).
                          value(theSecKeyColumn))) ||
                ((!theSecKeyHi) &&
                 Utils.sm(originalData.instance((int)probAndIndex[1]).
                          value(theSecKeyColumn),
                          originalData.instance((int)bestInsts[numDoc][j][1]).
                          value(theSecKeyColumn)))))) {
            for (int k = theMaxRankCutoff - 2; k >= j; k--) {
              bestInsts[numDoc][k + 1][0] = bestInsts[numDoc][k][0];
              bestInsts[numDoc][k + 1][1] = bestInsts[numDoc][k][1];
            }
            bestInsts[numDoc][j][0] = probAndIndex[0];
            bestInsts[numDoc][j][1] = probAndIndex[1];
            break;
          }
      }
    }
  }
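
  /*
   * Data flow in mostProbablePhrases(): the class probability comes from
   * the classifier applied to the discretized copy of each instance
   * ("data"), while the phrase text, document number and tie-breaking
   * columns are read from the untouched copy ("originalData"), so both
   * sets must be in the same order. On return, bestInsts[doc][rank][0]
   * holds the probability and bestInsts[doc][rank][1] the instance index,
   * sorted by probability and then by the secondary key column.
   */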

  /**
   * Method that builds the classifier.
   * @exception Exception if classifier can't be built successfully
   */

  public void buildClassifier(InputStream inputStream) throws Exception {

    Instances instances, instancesDel, instancesDisc;
    DeleteFilter del;
    int numPos;
    double[][][] bestInsts =
      new double[MAX_NUM_DOCS][theMaxRankCutoff][2];

    System.err.println("Loading data...");

    instances = new Instances(inputStream);
    instances.setClassIndex(instances.numAttributes() - 1);

    // Number of positive examples?

    System.err.println("Counting positive examples...");

    numPos = findMissedPositives(instances);

    if ((theNumPhrases <= 0) && (theMinProb < 0) && (!theUseTrain)) {

      // Cross-validation for parameter settings

      System.err.println("Performing cross-validation...");

      System.err.println("F-measure: (cross-validation) "+
                         optimizeFMeasure(instances, numPos));
    }

    System.err.println("Deleting columns...");

    // Deleting columns

    del = new DeleteFilter();
    del.setInvertSelection(true);
    del.setAttributeIndices(indices(instances));
    del.inputFormat(instances);
    instancesDel = Filter.useFilter(instances, del);

    if (theUseDiscretization) {

      System.err.println("Discretizing columns...");

      // Discretizing columns

      theDisc = new DiscretiseFilter();
      theDisc.setAttributeIndices(DISC_RANGE);
      if (theEqual) {
        theDisc.setBins(theNumBins);
        theDisc.setClassIndex(-1);
        theDisc.setUseMDL(false);
      } else
        theDisc.setClassIndex(instancesDel.numAttributes());
      theDisc.inputFormat(instancesDel);
      instancesDisc = Filter.useFilter(instancesDel, theDisc);
    } else
      instancesDisc = instancesDel;

    System.err.println("Building classifier...");

    // Build the classifier.

    theClassifier.buildClassifier(instancesDisc);

    System.err.println("\n"+theClassifier);

    if ((theNumPhrases <= 0) && (theUseTrain) && (theMinProb < 0)) {

      System.err.println("Finding most probable keyphrases...");

      // Find the most probable keyphrases.

      mostProbablePhrases(instances, instancesDisc, bestInsts);

      System.err.println("Finding best cutoffs...");

      // Find the best cutoffs.

      bestCutoffs(instances, bestInsts, numPos);
    } else if ((theNumPhrases > 0) || (theMinProb >= 0)) {
      theRankCutoff = theNumPhrases;
      theProbCutoff = theMinProb;
    }
  }
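
  /*
   * A minimal sketch of programmatic use (file names are placeholders and
   * error handling is omitted):
   *
   *   KEP kep = new KEP();
   *   kep.parseOptions(new String[] {"-N", "5"});
   *   kep.buildClassifier(new FileInputStream("train.arff"));
   *   kep.classifyPhrases(new FileInputStream("test.arff"));
   *
   * main() below drives the same sequence from the command line and
   * additionally serializes the trained KEP object when -m is given.
   */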

  /**
   * Method that classifies phrases.
   * @exception Exception if phrases can't be classified successfully.
   */

  public void classifyPhrases(InputStream inputStream) throws Exception {

    Instances instances, instancesDel, instancesDisc;
    DeleteFilter del;
    int numPos;
    double[][][] bestInsts =
      new double[MAX_NUM_DOCS][theMaxRankCutoff][2];

    if ((theNumPhrases > 0) || (theMinProb >= 0)) {
      theRankCutoff = theNumPhrases;
      theProbCutoff = theMinProb;
    }

    System.err.println("Preparing data...");

    // Prepare data and find total number of positive
    // examples.

    instances = new Instances(inputStream);
    instances.setClassIndex(instances.numAttributes() - 1);

    System.err.println("Counting positive examples...");

    // Number of positive examples?

    numPos = findMissedPositives(instances);

    System.err.println("Deleting columns...");

    // Deleting columns

    del = new DeleteFilter();
    del.setInvertSelection(true);
    del.setAttributeIndices(indices(instances));
    del.inputFormat(instances);
    instancesDel = Filter.useFilter(instances, del);

    if (theUseDiscretization)

      // Discretizing columns

      instancesDisc = Filter.useFilter(instancesDel, theDisc);
    else
      instancesDisc = instancesDel;

    System.err.println("Finding most probable keyphrases...");

    // Find the most probable keyphrases.

    mostProbablePhrases(instances, instancesDisc, bestInsts);

    System.err.println("Computing statistics...");

    // Compute F-Measure.

    System.err.println("Rank cutoff: "+theRankCutoff);
    System.err.println("Probability cutoff: "+theProbCutoff);

    System.err.println("F-Measure: "+
                       fMeasure(instances, bestInsts, theRankCutoff,
                                theProbCutoff, numPos, true));

    // Output average precisions for ranks and confidence intervals

    System.err.println("Rank\tPrec.\tConf.");
    precisions(instances, bestInsts);
  }

  /**
   * Generate results for different size training sets.
   */

  public void iterateTrainingSets(InputStream train, InputStream test)
    throws Exception {

    int[] sizes = new int[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50,
                             60, 70, 80, 90, 100, 110, 120, 130};
    Instances trainInst, testInst, tempTrainInst, tempTrainInstDisc, testInstDisc;
    SelectFilter selFilter;
    DeleteFilter del;
    int[] indices;
    int numPos;
    double[][][] bestInsts;

    trainInst = new Instances(train);
    trainInst.setClassIndex(trainInst.numAttributes() - 1);
    testInst = new Instances(test);
    testInst.setClassIndex(testInst.numAttributes() - 1);
    for (int i = 0; i < sizes.length; i++) {

      System.err.println("Processing "+sizes[i]+" documents");

      // Delete documents

      System.err.println("Selecting documents...");

      selFilter = new SelectFilter();
      indices = new int[sizes[i]];
      for (int j = 0; j < sizes[i]; j++)
        indices[j] = j;
      selFilter.setInvertSelection(false);
      selFilter.setAttributeIndex(2);
      selFilter.setNominalIndicesArr(indices);
      selFilter.inputFormat(trainInst);
      tempTrainInst = Filter.useFilter(trainInst, selFilter);

      numPos = findMissedPositives(tempTrainInst);

      System.err.println("Deleting columns...");

      // Deleting columns

      del = new DeleteFilter();
      del.setInvertSelection(true);
      del.setAttributeIndices(indices(tempTrainInst));
      del.inputFormat(tempTrainInst);
      tempTrainInstDisc = Filter.useFilter(tempTrainInst, del);

      if (theUseDiscretization) {

        System.err.println("Discretizing columns...");

        // Discretizing columns

        theDisc = new DiscretiseFilter();
        theDisc.setAttributeIndices(DISC_RANGE);
        if (theEqual) {
          theDisc.setBins(theNumBins);
          theDisc.setClassIndex(-1);
          theDisc.setUseMDL(false);
        } else
          theDisc.setClassIndex(tempTrainInstDisc.numAttributes());
        theDisc.inputFormat(tempTrainInstDisc);
        tempTrainInstDisc = Filter.useFilter(tempTrainInstDisc, theDisc);
      }

      System.err.println("Building classifier...");

      // Build the classifier.

      theClassifier.buildClassifier(tempTrainInstDisc);

      System.err.println("\n"+theClassifier);

      if ((theNumPhrases <= 0) && (theUseTrain) && (theMinProb < 0)) {

        System.err.println("Finding most probable keyphrases...");

        // Find the most probable keyphrases.

        bestInsts =
          new double[MAX_NUM_DOCS][theMaxRankCutoff][2];
        mostProbablePhrases(tempTrainInst, tempTrainInstDisc, bestInsts);

        System.err.println("Finding best cutoffs...");

        // Find the best cutoffs.

        bestCutoffs(tempTrainInst, bestInsts, numPos);
      } else if ((theNumPhrases > 0) || (theMinProb >= 0)) {
        theRankCutoff = theNumPhrases;
        theProbCutoff = theMinProb;
      }

      // Number of positive examples?

      numPos = findMissedPositives(testInst);

      System.err.println("Deleting columns...");

      // Deleting columns

      del = new DeleteFilter();
      del.setInvertSelection(true);
      del.setAttributeIndices(indices(testInst));
      del.inputFormat(testInst);
      testInstDisc = Filter.useFilter(testInst, del);

      // Discretizing columns

      testInstDisc = Filter.useFilter(testInstDisc, theDisc);

      System.err.println("Finding most probable keyphrases...");

      // Find the most probable keyphrases.

      bestInsts =
        new double[MAX_NUM_DOCS][theMaxRankCutoff][2];
      mostProbablePhrases(testInst, testInstDisc, bestInsts);

      System.err.println("Computing statistics...");

      // Compute F-Measure.

      System.err.println("Rank cutoff: "+theRankCutoff);
      System.err.println("Probability cutoff: "+theProbCutoff);

      System.err.println("F-Measure: "+
                         fMeasure(testInst, bestInsts, theRankCutoff,
                                  theProbCutoff, numPos, false));

      // Output average precisions for ranks and confidence intervals

      System.err.println("Rank\tPrec.\tConf.");
      precisions(testInst, bestInsts);
    }
  }

  /**
   * Returns an enumeration describing the available options.
   * @return an enumeration of all the available options
   */

  public Enumeration listOptions() {

    Vector newVector = new Vector(14);

    newVector.addElement(new Option(
      "\tName of the training file.",
      "t", 1, "-t <training file>"));
    newVector.addElement(new Option(
      "\tName of the test file.",
      "T", 1, "-T <test file>"));
    newVector.addElement(new Option(
      "\tSpecify list of columns to be used. Class column has to\n"+
      "\tbe listed.",
      "R", 1, "-R <name1,name2-name4,...>"));
    newVector.addElement(new Option(
      "\tUse equal width discretization.",
      "E", 0, "-E"));
    newVector.addElement(new Option(
      "\tDelete unlikely subphrases. Takes column to be used\n"+
      "\tin conjunction with probability to delete unlikely\n"+
      "\tsubphrases. Higher value is better. (\"\" means no column)",
      "S", 1, "-S <column name>"));
    newVector.addElement(new Option(
      "\tDelete unlikely subphrases. Takes column to be used in\n"+
      "\tconjunction with probability to delete unlikely subphrases.\n"+
      "\tLower value is better. (\"\" means no column)",
      "s", 1, "-s <column name>"));
    newVector.addElement(new Option(
      "\tSecondary key for sorting phrases in table. Primary key:\n"+
      "\tprobability. Higher value is better. (\"\" means no secondary key)",
      "K", 1, "-K <column name>"));
    newVector.addElement(new Option(
      "\tSecondary key for sorting phrases in table. Primary key:\n"+
      "\tprobability. Lower value is better. (\"\" means no secondary key)",
      "k", 1, "-k <column name>"));
    newVector.addElement(new Option(
      "\tColumn to be used for batched cross-validation.\n"+
      "\t(\"\" means standard 5-fold cross-validation)",
      "B", 1, "-B <column name>"));
    newVector.addElement(new Option(
      "\tThe maximum rank cutoff. (Default 20)",
      "X", 1, "-X <rank cutoff>"));
    newVector.addElement(new Option(
      "\tThe minimum rank cutoff. (Default 1)",
      "M", 1, "-M <rank cutoff>"));
    newVector.addElement(new Option(
      "\tDon't optimize F-measure. Just output the given number of\n"+
      "\tphrases.",
      "N", 1, "-N <number>"));
    newVector.addElement(new Option(
      "\tDon't optimize F-measure. Just output phrases with probability\n"+
      "\tgreater or equal to the given one.",
      "P", 1, "-P <prob>"));

    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */

  public void parseOptions(String[] options) throws Exception {

    String numPhrases = Utils.getOption('N', options);
    if (numPhrases.length() != 0)
      theNumPhrases = Integer.parseInt(numPhrases);

    String minProb = Utils.getOption('P', options);
    if (minProb.length() != 0)
      theMinProb = (new Double(minProb)).doubleValue();

    String useList = Utils.getOption('R', options);
    if (useList.length() != 0) {
      StringTokenizer st = new StringTokenizer(useList, ",", false);
      theCols = new String[st.countTokens()];
      for (int i = 0; i < theCols.length; i++)
        theCols[i] = st.nextToken();
    }

    if (Utils.getFlag('E', options)) {
      theEqual = true;
    }

    if (Utils.getFlag('D', options)) {
      theUseDiscretization = false;
    }

    String subphrases = Utils.getOption('S', options);
    if (subphrases.length() != 0) {
      theSubphraseColumnName = subphrases;
      theSubphraseHi = true;
    }

    String subphrasesII = Utils.getOption('s', options);
    if (subphrasesII.length() != 0) {
      if (subphrases.length() != 0)
        throw new Exception("Can't use -S and -s at the same time!");
      theSubphraseColumnName = subphrasesII;
      theSubphraseHi = false;
    }

    String key = Utils.getOption('K', options);
    if (key.length() != 0) {
      theSecKeyColumnName = key;
      theSecKeyHi = true;
    }

    String keyII = Utils.getOption('k', options);
    if (keyII.length() != 0) {
      if (key.length() != 0)
        throw new Exception("Can't use -K and -k at the same time!");
      theSecKeyColumnName = keyII;
      theSecKeyHi = false;
    }

    String cv = Utils.getOption('B', options);
    if (cv.length() != 0) {
      theUseTrain = false;
      theCVColumnName = cv;
    }

    String maxRankCutoff = Utils.getOption('X', options);
    if (maxRankCutoff.length() != 0)
      theMaxRankCutoff = Integer.parseInt(maxRankCutoff);

    String minRankCutoff = Utils.getOption('M', options);
    if (minRankCutoff.length() != 0)
      theMinRankCutoff = Integer.parseInt(minRankCutoff);

    // Only adjust the rank cutoff range if a number of phrases was
    // explicitly requested (theNumPhrases defaults to -1).
    if (theNumPhrases > 0) {
      if (theNumPhrases > theMaxRankCutoff)
        theMaxRankCutoff = theNumPhrases;
      if (theNumPhrases < theMinRankCutoff)
        theMinRankCutoff = theNumPhrases;
    }
  }
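
  /*
   * For example (illustrative values only), the switches
   * "-B doc -X 15 -E" make parseOptions() enable batched cross-validation
   * over the nominal column "doc", search rank cutoffs up to 15, and use
   * equal width discretization.
   */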

  // ======================================
  // Main method for command line interface
  // ======================================

  public static void main(String[] opts) {

    KEP kep;
    BufferedInputStream trainFile, testFile;
    ObjectInputStream modelIn;
    ObjectOutputStream modelOut;

    try {

      if (Utils.getFlag('I', opts)) {
        System.err.println("Iterating over training sets...");
        String train = Utils.getOption('t', opts);
        String test = Utils.getOption('T', opts);
        kep = new KEP();
        kep.parseOptions(opts);
        trainFile = new BufferedInputStream(new FileInputStream(train));
        testFile = new BufferedInputStream(new FileInputStream(test));
        kep.iterateTrainingSets(trainFile, testFile);
      } else {
        String model = Utils.getOption('m', opts);
        String train = Utils.getOption('t', opts);
        if (model.length() != 0) {
          if (train.length() != 0) {
            System.err.println("Building classifier and saving it...");
            kep = new KEP();
            kep.parseOptions(opts);
            trainFile = new BufferedInputStream(new FileInputStream(train));
            kep.buildClassifier(trainFile);
            modelOut =
              new ObjectOutputStream(new
                GZIPOutputStream(new FileOutputStream(model)));
            modelOut.writeObject(kep);
            modelOut.flush();
            modelOut.close();
          } else {
            System.err.println("Loading classifier...");
            modelIn =
              new ObjectInputStream(new
                GZIPInputStream(new FileInputStream(model)));
            kep = (KEP) modelIn.readObject();
            modelIn.close();
            kep.parseOptions(opts);
          }
        } else if (train.length() != 0) {
          System.err.println("Building classifier...");
          kep = new KEP();
          kep.parseOptions(opts);
          trainFile = new BufferedInputStream(new FileInputStream(train));
          kep.buildClassifier(trainFile);
        } else
          throw new Exception("Neither classifier nor training file given!");

        String test = Utils.getOption('T', opts);
        if (test.length() != 0) {
          System.err.println("Classifying phrases...");
          testFile = new BufferedInputStream(new FileInputStream(test));
          kep.classifyPhrases(testFile);
        }
      }
    } catch (Exception e) {
      System.err.println("\nOptions:\n");
      kep = new KEP();
      Enumeration optionEnum = kep.listOptions();
      while (optionEnum.hasMoreElements()) {
        Option option = (Option) optionEnum.nextElement();
        System.err.println(option.synopsis()+'\n'+option.description());
      }
      System.err.println();
      e.printStackTrace();
    }
  }
}