source: trunk/gsdl/perllib/Kea-1.1.4/k4.pl@ 1972

Last change on this file since 1972 was 1972, checked in by jmt14, 23 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 22.4 KB
Line 
1#!/usr/bin/perl -w
2
3# k4.pl
4# Version 1.1
5
6# Kea -- Automatic Keyphrase Extraction
7# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8# Contact [email protected] or [email protected]
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24# Version history
25#
26# 1.0 Ready to rebuild all the data for Witten et. al. 1998
27# 1.0.1 bug: empty @attribute enumerations cause java crashes.
28# 1.0.2 output number of documents found
29# 1.0.3 bug: if -F and !defined($df{phrase}) then divide by zero crash.
30# 1.0.4 Default attributes reduced to phrase, document, tfidf, first, class
31# 1.0.5 Add keyphrase-frequency file option and attribute
32# 1.0.6 keyphrase-frequency file corrects for covering input
33# 1.0.7 use kea-tidy-key-file.pl | stemmer to clean up key files
34# 1.0.8 -t toggle uses .tagged instead of .clauses files
35# 1.0.9 Cleaned up output so it goes to STDERR not STDOUT
36# 1.0.10 Use $perl_command for system-independence
37# 1.0.11 List command line parameters if no arguments are given.
38# 1.1 First Distribution. GPL added. Documentation improved.
39# 1.1.1 Tweaked output a little; no changes to function
40
# Banner goes to STDERR so STDOUT stays clean (see changelog 1.0.9).
print STDERR "\nk4.pl: the arff file builder for Kea\n";

# Root of the Greenstone installation, used to locate the stemmer helper.
# NOTE(review): assumes GSDLHOME is set in the environment -- no fallback.
$gsdlhome = $ENV{'GSDLHOME'};

# Command line options
# k4.pl [options] <input-directory> <output-arff-file>
# options are:
# -abciFs
# -L max length
# -f <df file>
# -S <stopword file>

# Legacy getopts library: each parsed flag appears as a package global
# ($opt_a, $opt_L, ...).  Options followed by ':' take an argument.
require("getopts.pl");
&Getopts("abciL:f:FsS:kK:t");

# What files shall we use?
# Both positional arguments are mandatory; otherwise print usage and stop.
if (!$ARGV[0] || !$ARGV[1]) {
    print STDERR "usage: k4.pl [options] <input-directory> <output-arff-file>\n";
    print STDERR " options: -a stopword mode (default)\n";
    print STDERR " -b brill-phrase mode (requires -t)\n";
    print STDERR " -c carl-phrase mode (requires -t)\n";
    print STDERR " -f [global frequency file]\n";
    print STDERR " -F global frequency file includes input files\n";
    print STDERR " -i ignore stopwords in word counts\n";
    print STDERR " -K [keyword frequency file]\n";
    print STDERR " -k keyword frequency file includes input files\n";
    print STDERR " -L [maximum phrase length] in words (default = 3)\n";
    print STDERR " -S [stopword file]\n";
    print STDERR " -s keep singleton phrases\n";
    print STDERR " -t input files are *.tagged (default is *.clauses)\n";
    die "\n";
}
# First positional argument: directory holding the input documents.
# Strip a trailing slash so the `find $directory/` call below does not
# produce a doubled separator.
$directory = $ARGV[0];
$directory =~ s/\/$//;

# Second positional argument: the ARFF file we will write.
$arfffile = $ARGV[1];
#print STDERR "Input directory: $directory\n";
#print STDERR "Output Arff file: $arfffile\n";


# Set the maximum candidate phrase length, in words.
# BUGFIX: the original pattern was /^\d$/, which matches a SINGLE digit
# only, so "-L 10" (or any value above 9) was silently ignored and the
# default of 3 used instead.  /^\d+$/ accepts any whole number.
if (($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
    $max_phrase_length = $opt_L;
} else {
    $max_phrase_length = 3;
}
print STDERR "Maximum phrase length: $max_phrase_length\n";
87
88
# Are we in Stopword mode, Brill mode, or Carl mode?
if ($opt_b && $opt_c) {
    print STDERR "Warning: Brill phrase option overriding Carl phrase option.\n";
    # BUGFIX: the warning promises that -b overrides -c, but the original
    # branch set NO mode at all, leaving the extraction loop with no phrase
    # filtering.  Enable Brill mode here exactly as the -b branch does.
    $brill_mode = 1;
    $brill_n = "/(NN|NNS|NNP|NNPS|VBG)\$";
    $brill_nj = "/(NN|NNS|NNP|NNPS|JJ)\$";

} elsif ($opt_b) {
    # Brill phrases: candidate phrases are validated against part-of-speech
    # tags.  The leading "/" in the patterns matches the word/TAG separator.
    print STDERR "Brill phrase mode\n";
    $brill_mode = 1;

    # last word of a phrase must carry one of these tags
    $brill_n = "/(NN|NNS|NNP|NNPS|VBG)\$";
    # earlier words may also be adjectives
    $brill_nj = "/(NN|NNS|NNP|NNPS|JJ)\$";

} elsif ($opt_c) {
    # Carl phrases
    print STDERR "Carl phrase mode\n";
    $carl_mode = 1;

    # regular expressions used to determine brillity
    $WORD = "[a-zA-Z0-9]+(\-?)[a-zA-Z0-9]+";
    $NOUN = "($WORD\/NN[A-Z]*)";
    $ADJ = "($WORD\/JJ[A-Z]*)";
    $VBG = "($WORD\ing\/VBG)";
    $VBN = "($WORD\ed\/VBN)";

    $VB = "($WORD\/VB)";
    $VBP = "($WORD\/VBP)";
    $VERB = "$VBN|$VB|$VBG|$VBP";

    # The OEDs conjunctions -- Carl's stuff
    $OED_CONJ = "((after|albeit|although|an|and|as|because|before|but|cos|directly|either|except|for|howbeit|if|immediately|instantly|lest|like|likewise|neither|nevertheless|nisi|nor|notwithstanding|now|only|or|otherwise|provided|providing|qua|since|so|supposing|than|that|tho|though|till|unless|until|when|whencesoever|whenever|whereas|whether|while|whilst|yet)\/CC)";

    # The OED's list of prepositions -- Carl's stuff
    $OED_PREP = "((abaft aboard about above across afore after against agin along alongside amid amidst among amongst anent around aslant astride at athwart bar barring before behind below beneath beside besides between betwixt beyond but by circa concerning considering despite down during ere except excepting failing for from in inside into less like mid midst minus near neath next nigh nigher nighest notwithstanding oer of off on onto outside over past pending per plus qua re respecting round sans save saving since thro through throughout thru till to touching toward towards tween twixt under underneath unlike until unto up upon versus via vice wanting with within without)\/IN)";

    # The OED based Carl-phrase wordlist
    # NOTE(review): $OED_PREP joins its words with spaces, not "|", so as a
    # regex alternation it only matches the entire literal word list --
    # this looks unintended, but is preserved as-is.  TODO confirm.
    $INCC = "$OED_CONJ|$OED_PREP";

} else {
    # stopword mode (the default): candidate phrases may not start or end
    # with a stopword.
    #print STDERR "Stopword mode\n";
    $stopword_mode = 1;

    $stopword_file = "stopwords";
    if ($opt_S) {
        if (-e "$opt_S") {
            $stopword_file = $opt_S;
        } else {
            # fall back to the default list rather than dying
            print STDERR "Can't find stopword file: $opt_S\n";
        }
    }
    print STDERR "Using stopword file: $stopword_file\n";

    # read the stopwords, one per line, lower-cased for case-insensitive
    # lookup.  NOTE(review): open is unchecked; a missing default file
    # silently yields an empty stopword list.
    open(S, "<$stopword_file");
    while (<S>) {
        chomp;
        $stopword{lc($_)} = 1;
    }
    close(S);
}
151
152
# Should we ignore stopwords in word counts?  (-i)
# Idiom fix: the original tested "$opt_i && $opt_i" (and likewise for
# -k, -s and -t below) -- a redundant self-conjunction; one test suffices.
if ($opt_i) {
    print STDERR "Ignoring stopwords in word counts.\n";
    $use_all_words_in_word_count = 0;
} else {
    #print STDERR "Using all words (including stopwords) in word count (default).\n";
    $use_all_words_in_word_count = 1;
}


# Which global-frequency file are we using
if ($opt_f) {
    $document_frequency_file = $opt_f;
} else {
    $document_frequency_file = "document-frequencies";
}
print STDERR "Document frequency file: $document_frequency_file\n";


# How is the global-frequency file created?
# Note: $testfile is added to the df figure for each phrase.
# If the current document is not part of the document frequencies
# file then the df figure needs to be increased by 1 to allow
# for the fact that the phrase also occurs in the current file.

if ($opt_F && (-e "$document_frequency_file")) {
    print STDERR "Text files are covered by specified document frequency file\n";
    $testfile = 0;
} elsif ($opt_F) {
    die "Document frequency file doesn't exist, -F option impossible\n";
} elsif (-e "$document_frequency_file") {
    print STDERR "Text files are not covered by document frequency file\n";
    $testfile = 1;
} else {
    print STDERR "Document frequency file does not exist: create from training data\n";
    $testfile = 0;
}

# Which keyword-frequency file are we using
if ($opt_K && (-e $opt_K)) {
    $keyword_frequency_file = $opt_K;
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    if ($opt_k) {
        print STDERR "Keyword frequency file covers input files\n";
        $kf_covers_input = 1;
    } else {
        print STDERR "Keyword frequency is independent of input\n";
        $kf_covers_input = 0;
    }
} elsif ($opt_K) {
    die "Keyword frequency file doesn't exist, -K option impossible\n";
} else {
    # empty string doubles as the "no kf file" flag throughout the script
    $keyword_frequency_file = "";
    print STDERR "No keyword frequency file\n";
}


# Should we show singleton phrases?  (-s keeps them)
if ($opt_s) {
    $suppress_singleton_phrases = 0;
    print STDERR "Including singleton phrases.\n";
} else {
    $suppress_singleton_phrases = 1;
    print STDERR "Ignoring singleton phrases (default).\n";
}


# Do we look for *.tagged or *.clauses?
$suffix = "clauses";
if ($opt_t) {
    $suffix = "tagged";
}
print STDERR "Input file suffix: $suffix\n";

# How to run scripts etc (changelog 1.0.10: system-independence)
$perl_command = "perl -w";

# Are we using Turney's data (hard-wired off; enables journal attribute)
$Turney = 0;
232
233
234
235
# Start keyphrase extraction

# find the documents: every *.$suffix file anywhere under the input
# directory, via the external `find` command.
@documents = split(/\s+/, `find $directory/ -name "*.$suffix"`);
if (!$documents[0]) {
    die "\nk4.pl error: no documents found.";
} else {
    # $#documents is the last index, so +1 is the document count
    print STDERR "\nProducing keyphrases for ", $#documents + 1, " documents\n";
}

print STDERR "Finding candidate phrases...\n";
$document_number = 1;
248
# For every input document, emit three parallel files:
#   $document.stemmed   -- each candidate phrase, stemmed (via the external
#                          stemmer the output pipe runs through)
#   $document.unstemmed -- the same phrases, unstemmed and untagged
#   $document.distance  -- word offset of each phrase's first word
foreach $document (@documents) {
    #print STDERR " document $document_number: $document\r";
    $document_number++;

    # if we've already produced this file, skip it.
    next if (-e "$document.stemmed");

    open(F, "<$document");

    #open(S, "| ./stemmer > $document.stemmed");
    open(S, "| $gsdlhome/perllib/Kea-1.1.4/stemmer > $document.stemmed");
    open(U, ">$document.unstemmed");
    open(D, ">$document.distance");

    # running word count from the start of the document
    $distance = 0;

    while (<F>) {
        chomp;

        # if we're in stopword mode, throw away the tags
        if ($stopword_mode) {
            s/\/\S+//g;
        }

        # work through each line
        @words = split(/ +/, $_);

        foreach $i (0..$#words) {
            $finalword = $words[$i];

            # calculate distance from start (-i not set):
            # every word counts
            if ($use_all_words_in_word_count) {
                $distance ++;
            }

            # make sure the last word in the phrase is okay
            if ($stopword_mode) {
                # the last word must not be a stopword
                next if $stopword{lc($finalword)};
            } elsif ($brill_mode) {
                # the last word must carry a noun/VBG tag
                next unless ($finalword =~ /$brill_n/);
            } elsif ($carl_mode) {
                # the last word must be either a noun or a vbg
                next unless (($finalword =~ /^$VBG$/) ||
                             ($finalword =~ /^$NOUN$/));
            }

            # calculate distance from start in -i mode
            # (this is still flawed for phrases containing stopwords)
            if (!$use_all_words_in_word_count) {
                $distance ++;
            }

            # print $finalword as a phrase of length 1
            if ($stopword_mode) {
                print U "$finalword\n";
                print S "$finalword\n";
            } else {
                # strip the /TAG suffix before output
                $untag = $finalword;
                $untag =~ s/\/\S+$//;
                print U "$untag\n";
                print S "$untag\n";
            }
            print D $distance, "\n";

            # construct each longer phrase that ends on this word.
            $phrase = $finalword;
            $counter = 1;

            foreach $j (1..$max_phrase_length) {

                # make sure we don't try to get a word from "before" the line
                last if ($j > $i);

                # prepend the next word leftwards
                $firstword = $words[$i - $j];
                $phrase = "$firstword $phrase";

                # make sure the phrase is valid
                if ($stopword_mode) {
                    # the first word may not be a stopword
                    next if $stopword{lc($firstword)};
                    $counter++;
                } elsif ($brill_mode) {
                    # all words other than the last must match $brill_nj
                    last unless ($firstword =~ /$brill_nj/);
                    $counter++;
                } elsif ($carl_mode) {
                    # the carl-reg-exp applies (checks the global $phrase)
                    if (($firstword =~ /^$VBG$/) || ($firstword =~ /^$NOUN$/)) {
                        $counter++;
                    }
                    next unless (isValidCarlPhrase());
                }

                # print the phrase
                if ($stopword_mode) {
                    print U "$phrase\n";
                    print S "$phrase\n";
                } else {
                    $untag = $phrase;
                    $untag =~ s/\/\S+//g;
                    print U "$untag\n";
                    print S "$untag\n";
                }

                # print the distance of the phrase's first word
                if ($use_all_words_in_word_count) {
                    print D ($distance - $j), "\n";
                } else {
                    print D ($distance - $counter), "\n";
                }
            }

        }
    }

    close(S);
    close(U);
    close(D);
    close(F);

    $document_size{$document} = $distance; #WILL CHANGE THIS BACK
    # BUGFIX: this was a hard-coded /home/jmt14/gsdl/... path; use $gsdlhome
    # for consistency with the stemmer invocation above, so the script works
    # outside the original author's home directory.
    `$perl_command $gsdlhome/perllib/Kea-1.1.4/kea-choose-best-phrase.pl $document`;
}
375
376#print STDERR "\nMake sure we know document sizes...\n";
377
378foreach $document (@documents) {
379 if (!defined($document_size{$document})) {
380 $max = 0;
381 open(D, "<$document.distance");
382 while (<D>) {
383 chomp;
384 if ($_ > $max) { $max = $_; }
385 }
386 close(D);
387 $document_size{$document} = $max;
388 print STDERR "$document has size $document_size{$document}\n";
389 }
390}
391
392
393
# Calculate document frequencies

print STDERR "Gathering document frequencies...\n";

if (-e "$document_frequency_file") {

    print STDERR "Found document frequencies -- reading them!\n";

    open(F, "<$document_frequency_file");

    # First line holds the number of documents the frequencies were
    # computed over; needed for the IDF term later.
    $numdocs_in_global_corpus = <F>;
    chomp($numdocs_in_global_corpus);
    print STDERR "$numdocs_in_global_corpus documents in frequency file.\n";

    # Remaining lines: "<phrase words...> <count>" -- the count is the
    # last space-separated field, everything before it is the phrase.
    while (<F>) {
        @data = split(/ +/, $_);
        $phrase = join(' ', @data[0..$#data - 1]);
        $df{$phrase} = $data[$#data];
    }
    close(F);

} else {

    #print STDERR "Document frequency file not found... creating\n";

    # Count, for each stemmed phrase, how many documents it appears in
    # (%seen guards against counting a document twice per phrase).
    foreach $document (@documents) {
        undef %seen;
        foreach $phrase (split(/\n/, `cat $document.stemmed`)) {
            $df{$phrase} ++ if (!defined($seen{$phrase}));
            $seen{$phrase} = 1;
        }
    }
    $numdocs_in_global_corpus = $#documents + 1;

    # Persist the table in the same format read above.
    print STDERR "Writing document frequencies to file...\n";
    open(F, ">$document_frequency_file");
    print F "$numdocs_in_global_corpus\n";
    foreach $phrase (keys %df) {
        @data = split(/ +/, $phrase);
        # note we keep phrases 1 longer then we need to just in
        # case we ever want to calculate entropy statistic
        if ($#data < $max_phrase_length) {
            print F "$phrase $df{$phrase}\n";
        }
    }
    close(F);
}
441
442
# Read the keyword frequency file, when one was supplied (-K).
# Format mirrors the document-frequency file: the first line is the number
# of documents it was built from; every later line is
# "<phrase words...> <count>", with the count in the last field.
if ($keyword_frequency_file) {
    print STDERR "Reading keyword frequency file\n";

    open(KF, "<$keyword_frequency_file");

    $size_of_keyword_frequency_file = <KF>;
    chomp($size_of_keyword_frequency_file);
    print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n";

    while (my $line = <KF>) {
        my @fields = split(/\s+/, $line);
        my $count = pop @fields;            # frequency is the final field
        $kf{ join(' ', @fields) } = $count; # key is the phrase itself
    }
    close(KF);
}
460
461
462
# What journals are we using?
# In Turney's corpus the numeric part of a document's filename determines
# the journal it came from; collect every journal that occurs so the ARFF
# header can enumerate them as a nominal attribute.
if ($Turney) {
    $journals{"unknown"} = 1;
    foreach $document (@documents) {
        ($docnum) = $document =~ /(\d+)\.$suffix/;

        my $journal_name;
        if    ($docnum < 7)  { $journal_name = "JAIHR"; }
        elsif ($docnum < 27) { $journal_name = "Psycoloquy"; }
        elsif ($docnum < 29) { $journal_name = "Neuroscientist"; }
        elsif ($docnum < 43) { $journal_name = "JCAMD"; }
        else                 { $journal_name = "BBS"; }

        $journals{$journal_name} = 1;
    }
}
481
482
# Write the arff files

print STDERR "Writing ARFF file\n";

open (ARFF, ">$arfffile");

# Header: relation name, then one @attribute line per feature, in the
# same order the instance rows are written below.
print ARFF "\@relation keyphrase\n\n";
print ARFF "\@attribute phrase string\n";
# nominal attribute enumerating every input file
print ARFF "\@attribute document {", join(', ', @documents), "}\n";

if ($Turney) {
    print ARFF "\@attribute journal {", join(', ', keys(%journals)), "}\n";
}
if ($keyword_frequency_file) {
    print ARFF "\@attribute keyword_freq real\n";
}

print ARFF "\@attribute tfidf real\n";
print ARFF "\@attribute first_occurrence real\n";


# Disabled candidate attributes, kept for reference:
# print ARFF "\@attribute entropy real\n";
# print ARFF "\@attribute in_first_5_percent real\n";
# print ARFF "\@attribute last real\n";
# print ARFF "\@attribute interval_deviation real\n";
# print ARFF "\@attribute max_cluster_of_$offset_clumping_cutoff real\n";
# print ARFF "\@attribute term_frequency real\n";
# print ARFF "\@attribute document_frequency real\n";
print ARFF "\@attribute class {yes,no}\n";

print ARFF "\n\@data\n";


# the number of keyphrases not covered by the arff file
print STDERR "Writing instances...\n";
$not_included = 0;
$document_number = 1;
520
# Main output loop: for each document, load its author-assigned keyphrases,
# aggregate per-phrase statistics from the .stemmed/.distance files, then
# write one ARFF instance per distinct candidate phrase.
foreach $document (@documents) {
    print STDERR " document $document_number: $document\r";
    $document_number++;

    # load the keyphrases: the .key file sits beside the input file
    undef %keyphrase;

    $keyphrases = $document;
    $keyphrases =~ s/$suffix$/key/;

    # tidy and stem the key file so its phrases are comparable with the
    # stemmed candidates.  NOTE(review): kea-tidy-key-file.pl and stemmer
    # are invoked without a path here -- presumably relies on cwd/PATH;
    # verify against how the script is launched.
    if (-e $keyphrases) {
        foreach (split(/\n/, `cat $keyphrases | $perl_command kea-tidy-key-file.pl | stemmer`)) {
            $keyphrase{$_} = 1;
        }
    }

    # Calculate document aggregates from stemmed file

    # total frequency in the document
    undef %tf;
    # the length of the document (in words, from the extraction pass)
    $document_size = $document_size{$document};

    # number of occurrences in the first 5% of the document
    # undef %first_5;
    # last occurrence
    # undef %last;
    # for standard deviation of distance between occurrences
    # undef %interval_sd;
    # undef %sumsquare;
    # undef %sum;
    # for entropy and Carl (disabled) measures
    # undef %entropy;
    # undef %f;
    # undef %ff;
    # $total_entropy{$document} = 0;
    # $total_instances{$document} = 0;
    # for calculating "offsets" sizes
    # undef %offsets;
    # undef %biggest_offsets;

    # First pass: count term frequencies.  S and D are parallel files,
    # one line per candidate phrase.
    open(S, "<$document.stemmed");
    open(D, "<$document.distance");

    while (<S>) {
        chop($phrase = $_);
        $tf{$phrase}++;

        #@p = split(/ +/, $phrase);
        #$left = join(' ', @p[0..$#p - 1]);
        #$right = join(' ', @p[1..$#p]);
        #$f{$left}{$phrase} ++;
        #$f{$right}{$phrase} ++;
        #$ff{$left} ++;
        #$ff{$right} ++;

        # keep D in step with S even though the value is unused here
        chop($distance = <D>);

        # $first_5{$phrase} ++ if ($distance / $document_size < 0.05);

        #if (!defined($sum{$phrase})) {
        # $sum{$phrase} = 0;
        # $sumsquare{$phrase} = 0;
        #} else {
        # $difference = $distance - $last{$phrase};
        # $sum{$phrase} += $difference;
        # $sumsquare{$phrase} += ($difference * $difference);
        #}

        # $last{$phrase} = $distance;

        #if (!defined($biggest_offsets{$phrase})) {
        # @{$offsets{$phrase}} = ($distance);
        # $biggest_offsets{$phrase} = 1;
        #} else {
        # $cutoff = $distance - $offset_clumping_cutoff;
        # #print "-- \n$phrase \noffsets: @{$offsets{$phrase}}\n";
        # push( @{$offsets{$phrase}}, $distance );
        # while (@{$offsets{$phrase}}[0] < $cutoff) {
        # shift( @{$offsets{$phrase}} );
        # }
        # # print "offsets: @{$offsets{$phrase}}\n";
        # if ($#{$offsets{$phrase}} >= $biggest_offsets{$phrase}) {
        # $biggest_offsets{$phrase} = $#{$offsets{$phrase}} + 1;
        # }
        #}
    }

    close(S);
    close(D);
    # undef %offsets;


    # Calculate complex attribute values (all currently disabled)

    foreach $phrase (keys(%tf)) {

        # Calculate standard deviation of distance between occurrences
        #$interval_sd{$phrase} =
        # sqrt( ($sumsquare{$phrase} -
        # ($sum{$phrase} * $sum{$phrase} / $tf{$phrase}))
        # / $tf{$phrase} );
        #undef $sum{$phrase};
        #undef $sumsquare{$phrase};


        # Calculate Entropy
        #$t = 0;
        #$entropy = 0;
        #$entropy = -1 if (!defined($f{$phrase}));
        #
        #foreach $superphrase (keys(%{$f{$phrase}})) {
        # $t += $f{$phrase}{$superphrase};
        #}
        #
        #foreach $superphrase (keys(%{$f{$phrase}})) {
        # $p = $f{$phrase}{$superphrase} / $t;
        # $entropy -= $p * log($p) / log(2);
        #}
        #
        #$entropy{$phrase} = $entropy;

        # Calculate Carl statistic
        #$left = join(' ', @words[0..$#words - 1]);
        #$right = join(' ', @words[1..$#words]);

        #$carl1 = $f{$left}{$phrase} / $ff{$left};
        #$carl2 = $f{$right}{$phrase} / $ff{$right};
        #$carlmin = ($carl1 < $carl2) ? $carl1 : $carl2;
        #if ($#words == 0) { $carlmin = 1; }


        #undef $f{$phrase};
        #undef $ff{$phrase};

    }



    # Second pass: write the arff instances.  S, U and D are parallel
    # files, read in lock-step.
    open(S, "<$document.stemmed");
    open(U, "<$document.unstemmed");
    open(D, "<$document.distance");

    undef %seen;

    while (<S>) {
        chop($phrase = $_);
        chop($unstemmed = <U>);
        chop($distance = <D>);

        # only output each phrase once (first occurrence wins, so
        # $distance is the phrase's first offset)
        next if (defined($seen{$phrase}));
        $seen{$phrase} = 1;

        # ignore phrases of more than N words
        @words = split(/ +/, $phrase);
        next if ($#words >= $max_phrase_length);

        # ignore likely proper-nouns
        next if ($unstemmed =~ /^PN\-/);

        # ignore singleton phrases (unless -s)
        next if (($tf{$phrase} == 1) && $suppress_singleton_phrases);

        # Calculate TFIDF.  $testfile is 1 when this document is NOT part
        # of the df corpus, correcting both numerator and denominator
        # (also avoids the divide-by-zero noted in changelog 1.0.3).
        if (!defined($df{$phrase})) {
            if ($testfile) { $df{$phrase} = 0 }
            else { $df{$phrase} = 1; }
        }
        $tfidf = ($tf{$phrase} / $document_size) *
            (log(($numdocs_in_global_corpus + $testfile) / ($df{$phrase} + $testfile)) / log(2));

        # Initialise number of words in first 5%
        # $first_5{$phrase} = 0 if (!defined($first_5{$phrase}));

        # Calculate class: "yes" iff the phrase is an author keyphrase
        $class = defined($keyphrase{$phrase}) ? "yes" : "no";

        # For the Turney corpus, identify the journal from the filename
        ($docnum) = $document =~ /(\d+)\.$suffix/;
        if (!$Turney || !$docnum) {
            $journal = "?";
        } elsif ($docnum < 7) {
            $journal = "JAIHR";
        } elsif ($docnum < 27) {
            $journal = "Psycoloquy";
        } elsif ($docnum < 29) {
            $journal = "Neuroscientist";
        } elsif ($docnum < 43) {
            $journal = "JCAMD";
        } else {
            $journal = "BBS";
        }


        # Write an instance to the arff file, in header order
        print ARFF "'$phrase ($unstemmed)',$document,";
        if ($Turney) { print ARFF "$journal,"; }
        if ($keyword_frequency_file) {
            if ($kf{$phrase}) {
                # when the kf file covers the input, discount this
                # document's own contribution for its keyphrases
                if (defined($keyphrase{$phrase}) && $kf_covers_input) {
                    print ARFF ($kf{$phrase} - 1), ",";
                } else {
                    print ARFF "$kf{$phrase},";
                }
            } else {
                print ARFF "0,";
            }
        }
        printf(ARFF "%.3g,%.3g,%s\n",
            $tfidf,
            $distance / $document_size,
            # $entropy{$phrase},
            # $first_5{$phrase} / $tf{$phrase},
            # $last{$phrase} / $document_size,
            # $interval_sd{$phrase},
            # $biggest_offsets{$phrase},
            # $tf{$phrase} / $document_size,
            # $df{$phrase} + $testfile,
            $class
        );

        # mark this keyphrase as covered (0 = seen among candidates)
        $keyphrase{$phrase} = 0 if (defined($keyphrase{$phrase}));

    }

    # Author keyphrases that never appeared as candidates are recorded as
    # comment + all-missing instances, so coverage can be audited.
    print ARFF "% Begin missing phrases for $document\n";
    foreach $keyphrase (keys(%keyphrase)) {
        next if ($keyphrase{$keyphrase} == 0);
        $not_included ++;
        print ARFF "'$keyphrase', $document, ";
        if ($Turney) { print ARFF " ?, "; }
        if ($keyword_frequency_file) { print ARFF " ?, "; }
        print ARFF "?, ?, ?\n";
    }
    print ARFF "% Finish missing phrases for $document\n";

    close(S);
    close(U);
    close(D);
}
763
# finish off arff file
print ARFF "% $not_included key phrases not included\n";
# BUGFIX: this diagnostic went to STDOUT via a bare print; the script's own
# convention (changelog 1.0.9) is that all status output goes to STDERR so
# STDOUT stays clean.  Route it to STDERR.
print STDERR "% $not_included key phrases not included\n";


print STDERR "k4.pl: $arfffile complete\n\n";
770
771
772
773
774
# Is the current phrase a valid Carl phrase?
# The phrase is stored in the global $phrase; the part-of-speech regular
# expressions ($NOUN, $ADJ, $VERB, $VBG, $INCC) are set up in carl mode.
# A valid Carl phrase is a run of nouns/adjectives/verbs, optionally
# joined by single conjunctions/prepositions, ending in a noun or VBG.
# Returns 1 if valid, 0 otherwise.


sub isValidCarlPhrase () {
    # Scope the working variables lexically -- the original leaked
    # @wds/$index/$wd as package globals.
    my @wds = split(/ +/, $phrase);
    my $index = 0;
    my $wd = $wds[$index];

    while ($index < $#wds) {
        # current word must be a noun, adj, or verb
        if (($wd =~ /^$NOUN$/) || ($wd =~ /^$ADJ$/) || ($wd =~ /^$VERB$/)) {
            $index++;
            $wd = $wds[$index];
        } else {
            return 0;
        }

        # next is an optional incc (conjunction or preposition)
        if ($wd =~ /^$INCC$/) {
            # it is an incc, so advance one word
            $index++;
            $wd = $wds[$index];
        }
    }

    # since we can advance two in the loop, it's possible to have
    # run out of input. If this is the case, then the phrase is
    # not valid, as there's no room for a final NN or VBG
    if ($index > $#wds) {
        return 0;
    }

    # the last word must be either a noun or a vbg
    if (($wd =~ /^$VBG$/) || ($wd =~ /^$NOUN$/)) {
        return 1;
    } else {
        return 0;
    }
}
Note: See TracBrowser for help on using the repository browser.