source: trunk/gsdl/perllib/Kea-1.1.4/k4.pl@ 9156

Last change on this file since 9156 was 4281, checked in by jrm21, 21 years ago

1) use Getopt::Std instead of getopts.pl
2) Only print informative messages if debug is set
Kea only:
3) only compile stemmer if it isn't already there.
4) remove temp files unless debug is set
k4.pl only:
5) need to move isValidCarlPhrase() before first call

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.0 KB
1#!/usr/bin/perl -w
2
3# k4.pl
4# Version 1.1
5
6# Kea -- Automatic Keyphrase Extraction
7# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8# Contact [email protected] or [email protected]
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24# Version history
25#
26# 1.0 Ready to rebuild all the data for Witten et al. 1998
27# 1.0.1 bug: empty @attribute enumerations cause java crashes.
28# 1.0.2 output number of documents found
29# 1.0.3 bug: if -F and !defined($df{phrase}) then divide by zero crash.
30# 1.0.4 Default attributes reduced to phrase, document, tfidf, first, class
31# 1.0.5 Add keyphrase-frequency file option and attribute
32# 1.0.6 keyphrase-frequency file corrects for covering input
33# 1.0.7 use kea-tidy-key-file.pl | stemmer to clean up key files
34# 1.0.8 -t toggle uses .tagged instead of .clauses files
35# 1.0.9 Cleaned up output so it goes to STDERR not STDOUT
36# 1.0.10 Use $perl_command for system-independence
37# 1.0.11 List command line parameters if no arguments are given.
38# 1.1 First Distribution. GPL added. Documentation improved.
39# 1.1.1 Tweaked output a little; no changes to function
40
41print STDERR "k4.pl: the arff file builder for Kea\n";
42
43$gsdlhome = $ENV{'GSDLHOME'};
44
45# Command line options
46# k4.pl [options] <input-directory> <output-arff-file>
47# options are:
48# -abciFskt
49# -L <max phrase length>
50# -f <df file>, -K <keyword frequency file>
51# -S <stopword file>
52
53use Getopt::Std;
54$opt_i = 0;
55$opt_t = 0;
56getopts("abciL:f:FsS:kK:t");
57my $debug = 0;
58
59# What files shall we use?
60if (!$ARGV[0] || !$ARGV[1]) {
61 print STDERR "usage: k4.pl [options] <input-directory> <output-arff-file>\n";
62 print STDERR " options: -a stopword mode (default)\n";
63 print STDERR " -b brill-phrase mode (requires -t)\n";
64 print STDERR " -c carl-phrase mode (requires -t)\n";
65 print STDERR " -f [global frequency file]\n";
66 print STDERR " -F global frequency file includes input files\n";
67 print STDERR " -i ignore stopwords in word counts\n";
68 print STDERR " -K [keyword frequency file]\n";
69 print STDERR " -k keyword frequency file includes input files\n";
70 print STDERR " -L [maximum phrase length] in words (default = 3)\n";
71 print STDERR " -S [stopword file]\n";
72 print STDERR " -s keep singleton phrases\n";
73 print STDERR " -t input files are *.tagged (default is *.clauses)\n";
74 die "\n";
75}
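# For example (a hypothetical run; directory and file names are
# illustrative only), building an ARFF file from a directory of
# *.clauses files with the default settings might look like:
#
#   perl k4.pl -L 3 -S stopwords -f document-frequencies corpus/train train.arff
#
# which reads corpus/train/*.clauses and writes the candidate-phrase
# instances to train.arff.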
76my $directory = $ARGV[0];
77$directory =~ s/\/$//;
78$arfffile = $ARGV[1];
79
80if ($debug) {
81 print STDERR "Input directory: $directory\n";
82 print STDERR "Output Arff file: $arfffile\n";
83}
84
85
86
87
88# Is the current phrase a valid Carl phrase?
89# The phrase is stored in $phrase.
90my ($NOUN, $ADJ, $VERB, $INCC, $VBG); # globals, set below
91
92sub isValidCarlPhrase () {
93 @wds = split(/ +/, $phrase);
94 $index = 0;
95 $wd = $wds[$index];
96
97 while ($index < $#wds) {
98 # current word must be a noun, adj, or verb
99 if (($wd =~ /^$NOUN$/) || ($wd =~ /^$ADJ$/) || ($wd =~ /^$VERB$/)) {
100 $index++;
101 $wd = $wds[$index];
102 } else {
103 return 0;
104 }
105
106 # next is an optional incc
107 if ($wd =~ /^$INCC$/) {
108 # it is an incc, so advance one word
109 $index++;
110 $wd = $wds[$index];
111 }
112 }
113
114 # since we can advance two words per loop iteration, we may have
115 # run out of input. If so, the phrase is not a valid Carl phrase,
116 # as there is no room for a final noun or VBG
117 if ($index > $#wds) {
118 return 0;
119 }
120
121 # the last word must be either a noun or a vbg
122 if (($wd =~ /^$VBG$/) || ($wd =~ /^$NOUN$/)) {
123 return 1;
124 } else {
125 return 0;
126 }
127}
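# A worked example of the check above (hypothetical tagged input): the
# phrase "empirical/JJ analysis/NN of/IN keyphrases/NNS" is accepted,
# because it walks ADJ, NOUN, INCC and ends on a NOUN, whereas
# "of/IN keyphrases/NNS" is rejected because its first word is neither
# a noun, an adjective nor a verb.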
128
129
130
131
132
133
134# Set the maximum phrase length
135if (($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
136 $max_phrase_length = $opt_L;
137} else {
138 $max_phrase_length = 3;
139}
140
141if ($debug) {
142 print STDERR "Maximum phrase length: $max_phrase_length\n";
143}
144
145# Are we in Stopword mode, Brill mode, or Carl mode?
146if ($opt_b && $opt_c) {
147 print STDERR "Warning: Brill phrase option overriding Carl phrase option.\n";
148 $opt_c = 0;
149}
150if ($opt_b) { # Brill phrases
151 if ($debug) {print STDERR "Brill phrase mode\n";}
152 $brill_mode = 1;
153
154 $brill_n = "/(NN|NNS|NNP|NNPS|VBG)\$";
155 $brill_nj = "/(NN|NNS|NNP|NNPS|JJ)\$";
156
157} elsif ($opt_c) {
158 # Carl phrases
159 if ($debug) {print STDERR "Carl phrase mode\n";}
160 $carl_mode = 1;
161
162 # regular expressions used to determine brillity
163 $WORD = "[a-zA-Z0-9]+(\-?)[a-zA-Z0-9]+";
164 $NOUN = "($WORD\/NN[A-Z]*)";
165 $ADJ = "($WORD\/JJ[A-Z]*)";
166 $VBG = "(${WORD}ing\/VBG)";
167 $VBN = "(${WORD}ed\/VBN)";
168
169 $VB = "($WORD\/VB)";
170 $VBP = "($WORD\/VBP)";
171 $VERB = "($VBN|$VB|$VBG|$VBP)";
172
173 # The OED's conjunctions -- Carl's stuff
174 $OED_CONJ = "((after|albeit|although|an|and|as|because|before|but|cos|directly|either|except|for|howbeit|if|immediately|instantly|lest|like|likewise|neither|nevertheless|nisi|nor|notwithstanding|now|only|or|otherwise|provided|providing|qua|since|so|supposing|than|that|tho|though|till|unless|until|when|whencesoever|whenever|whereas|whether|while|whilst|yet)\/CC)";
175
176 # The OED's list of prepositions -- Carl's stuff
177 $OED_PREP = "((abaft|aboard|about|above|across|afore|after|against|agin|along|alongside|amid|amidst|among|amongst|anent|around|aslant|astride|at|athwart|bar|barring|before|behind|below|beneath|beside|besides|between|betwixt|beyond|but|by|circa|concerning|considering|despite|down|during|ere|except|excepting|failing|for|from|in|inside|into|less|like|mid|midst|minus|near|neath|next|nigh|nigher|nighest|notwithstanding|oer|of|off|on|onto|outside|over|past|pending|per|plus|qua|re|respecting|round|sans|save|saving|since|thro|through|throughout|thru|till|to|touching|toward|towards|tween|twixt|under|underneath|unlike|until|unto|up|upon|versus|via|vice|wanting|with|within|without)\/IN)";
178
179 # The OED based Carl-phrase wordlist
180 $INCC = "($OED_CONJ|$OED_PREP)";
181
182
183
184} else {
185 # stopword mode
186 #print STDERR "Stopword mode\n";
187 $stopword_mode = 1;
188
189 $stopword_file = "stopwords";
190 if ($opt_S) {
191 if (-e "$opt_S") {
192 $stopword_file = $opt_S;
193 } else {
194 print STDERR "Can't find stopword file: $opt_S\n";
195 }
196 }
197 if ($debug) {
198 print STDERR "Using stopword file: $stopword_file\n";
199 }
200
201 # read the stopwords
202 open(S, "<$stopword_file");
203 while (<S>) {
204 chomp;
205 $stopword{lc($_)} = 1;
206 }
207 close(S);
208}
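# The stopword file is plain text, one entry per line; each entry is
# lower-cased on loading and candidate words are compared
# case-insensitively. A minimal file might contain (contents
# illustrative):
#
#   the
#   of
#   and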
209
210
211# Should we ignore stopwords in word counts?
212if ($opt_i) {
213 print STDERR "Ignoring stopwords in word counts.\n";
214 $use_all_words_in_word_count = 0;
215} else {
216 #print STDERR "Using all words (including stopwords) in word count (default).\n";
217 $use_all_words_in_word_count = 1;
218}
219
220
221# Which global-frequency file are we using
222if ($opt_f) {
223 $document_frequency_file = $opt_f;
224} else {
225 $document_frequency_file = "document-frequencies";
226}
227
228if ($debug) {
229 print STDERR "Document frequency file: $document_frequency_file\n";
230}
231
232
233# How is the global-frequency file created?
234# Note: $testfile is added to the df figure for each phrase.
235# If the current document is not part of the document frequencies
236# file then the df figure needs to be increased by 1 to allow
237# for the fact that the phrase also occurs in the current file.
238
239if ($opt_F && (-e "$document_frequency_file")) {
240 if ($debug) {
241 print STDERR "Text files are covered by specified document "
242 . "frequency file\n";
243 }
244 $testfile = 0;
245} elsif ($opt_F) {
246 die "Document frequency file doesn't exist, -F option impossible\n";
247} elsif (-e "$document_frequency_file") {
248 if ($debug) {
249 print STDERR "Text files are not covered by document frequency file\n";
250 }
251 $testfile = 1;
252} else {
253 print STDERR "Document frequency file does not exist: create from training data\n";
254 $testfile = 0;
255}
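# Worked example of the correction (figures illustrative): if a phrase
# has a df of 3 in a frequency file built from a separate corpus of 100
# documents, then $testfile is 1 and the TFIDF computation below uses an
# effective df of 3 + 1 = 4 and a corpus size of 100 + 1 = 101,
# crediting the phrase's occurrence in the current document.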
256
257# Which keyword-frequency file are we using
258if ($opt_K && (-e $opt_K)) {
259 $keyword_frequency_file = $opt_K;
260 if ($debug) {
261 print STDERR "Keyword frequency file: $keyword_frequency_file\n";
262 }
263 if ($opt_k && $opt_k) {
264 if ($debug) {
265 print STDERR "Keyword frequency file covers input files\n";
266 }
267 $kf_covers_input = 1;
268 } else {
269 if ($debug) {
270 print STDERR "Keyword frequency is independent of input\n";
271 }
272 $kf_covers_input = 0;
273 }
274} elsif ($opt_K) {
275 die "Keyword frequency file doesn't exist, -K option impossible\n";
276} else {
277 $keyword_frequency_file = "";
278 if ($debug) {print STDERR "No keyword frequency file\n";}
279}
280
281
282# Should we show singleton phrases?
283if ($opt_s && $opt_s) {
284 $suppress_singleton_phrases = 0;
285 print STDERR "Including singleton phrases.\n"
286} else {
287 $suppress_singleton_phrases = 1;
288 if ($debug) {print STDERR "Ignoring singleton phrases (default).\n"}
289}
290
291
292# Do we look for *.tagged or *.clauses?
293$suffix = "clauses";
294if ($opt_t) {
295 $suffix = "tagged";
296}
297
298if ($debug) {
299 print STDERR "Input file suffix: $suffix\n";
300}
301
302# How to run scripts etc
303my $perl_command = "perl";
304my @perl_args = ("-w"); # interpreter switches passed to system() below
305
306# Are we using Turney's data
307$Turney = 0;
308
309
310
311
312# Start keyphrase extraction
313
314# find the documents
315@documents = split(/\s+/, `find $directory/ -name "*.$suffix"`);
316if (!$documents[0]) {
317 die "\nk4.pl error: no documents found.";
318} else {
319 print STDERR "Producing keyphrases for ", $#documents + 1, " documents\n";
320}
321
322if ($debug) {print STDERR "Finding candidate phrases...\n";}
323$document_number = 1;
324
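# Each document produces three parallel working files, one candidate
# phrase per line: $document.stemmed (the phrase run through the
# stemmer), $document.unstemmed (the phrase with part-of-speech tags
# stripped) and $document.distance (the word offset of the phrase's
# first word from the start of the document). These are re-read below
# when the ARFF instances are written.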
325foreach $document (@documents) {
326 if ($debug) {print STDERR " document $document_number: $document\n";}
327 $document_number++;
328
329 # if we've already produced this file, skip it.
330 next if (-e "$document.stemmed");
331
332 open(F, "<$document");
333
334 #open(S, "| ./stemmer > $document.stemmed");
335 open(S, "| $gsdlhome/perllib/Kea-1.1.4/stemmer > $document.stemmed");
336 open(U, ">$document.unstemmed");
337 open(D, ">$document.distance");
338
339 $distance = 0;
340
341 while (<F>) {
342 chomp;
343
344 # if we're in stopword mode, throw away the tags
345 if ($stopword_mode) {
346 s/\/\S+//g;
347 }
348
349 #work through each line
350 @words = split(/ +/, $_);
351
352 foreach $i (0..$#words) {
353 $finalword = $words[$i];
354
355 # calculate distance from start (-i not set)
356 if ($use_all_words_in_word_count) {
357 $distance ++;
358 }
359
360 # make sure the last word in the phrase is okay
361 if ($stopword_mode) {
362 # the last word must not be a stopword
363 next if $stopword{lc($finalword)};
364 } elsif ($brill_mode) {
365 # the last word must match $brill_n
366 next unless ($finalword =~ /$brill_n/);
367 } elsif ($carl_mode) {
368 # the last word must be either a noun or a vbg
369 next unless (($finalword =~ /^$VBG$/) ||
370 ($finalword =~ /^$NOUN$/));
371 }
372
373 # calculate distance from start in -i mode
374 # (this is still flawed for phrases containing stopwords)
375 if (!$use_all_words_in_word_count) {
376 $distance ++;
377 }
378
379 # print $finalword as a phrase of length 1
380 if ($stopword_mode) {
381 print U "$finalword\n";
382 print S "$finalword\n";
383 } else {
384 $untag = $finalword;
385 $untag =~ s/\/\S+$//;
386 print U "$untag\n";
387 print S "$untag\n";
388 }
389 print D $distance, "\n";
390
391 # construct each longer phrase that ends on this word.
392 $phrase = $finalword;
393 $counter = 1;
394
395 foreach $j (1..$max_phrase_length) {
396
397 # make sure we don't try to get a word from "before" the line
398 last if ($j > $i);
399
400 # find the first word of the next phrase
401 $firstword = $words[$i - $j];
402 $phrase = "$firstword $phrase";
403
404 # make sure the phrase is valid
405 if ($stopword_mode) {
406 # the first word may not be a stopword
407 next if $stopword{lc($firstword)};
408 $counter++;
409 } elsif ($brill_mode) {
410 # all words other than the last must match $brill_nj
411 last unless ($firstword =~ /$brill_nj/);
412 $counter++;
413 } elsif ($carl_mode) {
414 # count the first word if it is a noun or VBG, then require the whole phrase to be a valid Carl phrase
415 if (($firstword =~ /^$VBG$/) || ($firstword =~ /^$NOUN$/)) {
416 $counter++;
417 }
418 next unless (isValidCarlPhrase());
419 }
420
421 # print the phrase
422 if ($stopword_mode) {
423 print U "$phrase\n";
424 print S "$phrase\n";
425 } else {
426 $untag = $phrase;
427 $untag =~ s/\/\S+//g;
428 print U "$untag\n";
429 print S "$untag\n";
430 }
431
432 # print the distance
433 if ($use_all_words_in_word_count) {
434 print D $distance - $j, "\n";
435 } else {
436 print D $distance - $counter, "\n";
437 }
438 }
439
440 }
441 }
442
443 close(S);
444 close(U);
445 close(D);
446 close(F);
447
448 $document_size{$document} = $distance; #WILL CHANGE THIS BACK
449
450 system($perl_command, @perl_args, "$gsdlhome/perllib/Kea-1.1.4/kea-choose-best-phrase.pl", $document);
451}
452
453#print STDERR "\nMake sure we know document sizes...\n";
454
455foreach $document (@documents) {
456 if (!defined($document_size{$document})) {
457 $max = 0;
458 open(D, "<$document.distance");
459 while (<D>) {
460 chomp;
461 if ($_ > $max) { $max = $_; }
462 }
463 close(D);
464 $document_size{$document} = $max;
465 print STDERR "$document has size $document_size{$document}\n";
466 }
467}
468
469
470
471# Calculate document frequencies
472
473if ($debug) {print STDERR "Gathering document frequencies...\n";}
474
475if (-e "$document_frequency_file") {
476 if ($debug) {
477 print STDERR "Found document frequencies -- reading them!\n";
478 }
479 open(F, "<$document_frequency_file");
480
481 $numdocs_in_global_corpus = <F>;
482 chomp($numdocs_in_global_corpus);
483 print STDERR "$numdocs_in_global_corpus documents in frequency file.\n";
484
485 while (<F>) {
486 @data = split(/ +/, $_);
487 $phrase = join(' ', @data[0..$#data - 1]);
488 $df{$phrase} = $data[$#data];
489 }
490 close(F);
491
492} else {
493
494 print STDERR "Document frequency file not found... creating\n";
495
496 foreach $document (@documents) {
497 undef %seen;
498 foreach $phrase (split(/\n/, `cat $document.stemmed`)) {
499 $df{$phrase} ++ if (!defined($seen{$phrase}));
500 $seen{$phrase} = 1;
501 }
502 }
503 $numdocs_in_global_corpus = $#documents + 1;
504
505 if ($debug) {print STDERR "Writing document frequencies to file...\n";}
506 open(F, ">$document_frequency_file");
507 print F "$numdocs_in_global_corpus\n";
508 foreach $phrase (keys %df) {
509 @data = split(/ +/, $phrase);
510 # note: we keep phrases one word longer than we need, just in
511 # case we ever want to calculate the entropy statistic
512 if ($#data < $max_phrase_length) {
513 print F "$phrase $df{$phrase}\n";
514 }
515 }
516 close(F);
517}
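# Either way, the document-frequencies file has the form (figures and
# stems illustrative):
#
#   120
#   keyphras extract 17
#   neural network 42
#
# i.e. the number of documents in the corpus on the first line, then one
# stemmed phrase and its document frequency per line.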
518
519
520# Read the keyword frequency file
521if ($keyword_frequency_file) {
522 if ($debug) {print STDERR "Reading keyword frequency file\n";}
523
524 open(KF, "<$keyword_frequency_file");
525
526 $size_of_keyword_frequency_file = <KF>;
527 chomp($size_of_keyword_frequency_file);
528 if ($debug) {
529 print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n";
530 }
531
532 while (<KF>) {
533 @data = split(/\s+/, $_);
534 $phrase = join(' ', @data[0..$#data - 1]);
535 $kf{$phrase} = $data[$#data];
536 }
537 close(KF);
538}
539
540
541
542# What journals are we using?
543if ($Turney) {
544 $journals{"unknown"} = 1;
545 foreach $document (@documents) {
546 ($docnum) = $document =~ /(\d+)\.$suffix/;
547 if ($docnum < 7) {
548 $journals{"JAIHR"} = 1;
549 } elsif ($docnum < 27) {
550 $journals{"Psycoloquy"} = 1;
551 } elsif ($docnum < 29) {
552 $journals{"Neuroscientist"} = 1;
553 } elsif ($docnum < 43) {
554 $journals{"JCAMD"} = 1;
555 } else {
556 $journals{"BBS"} = 1;
557 }
558 }
559}
560
561
562# Write the arff files
563
564if ($debug) {print STDERR "Writing ARFF file\n";}
565
566open (ARFF, ">$arfffile");
567
568print ARFF "\@relation keyphrase\n\n";
569print ARFF "\@attribute phrase string\n";
570print ARFF "\@attribute document {", join(', ', @documents), "}\n";
571
572if ($Turney) {
573 print ARFF "\@attribute journal {", join(', ', keys(%journals)), "}\n";
574}
575if ($keyword_frequency_file) {
576 print ARFF "\@attribute keyword_freq real\n";
577}
578
579print ARFF "\@attribute tfidf real\n";
580print ARFF "\@attribute first_occurrence real\n";
581
582
583# print ARFF "\@attribute entropy real\n";
584# print ARFF "\@attribute in_first_5_percent real\n";
585# print ARFF "\@attribute last real\n";
586# print ARFF "\@attribute interval_deviation real\n";
587# print ARFF "\@attribute max_cluster_of_$offset_clumping_cutoff real\n";
588# print ARFF "\@attribute term_frequency real\n";
589# print ARFF "\@attribute document_frequency real\n";
590print ARFF "\@attribute class {yes,no}\n";
591
592print ARFF "\n\@data\n";
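# With the default attribute set (no keyword frequency file and $Turney
# off), each instance written below has the form (values illustrative):
#
#   'keyphras extract (keyphrase extraction)',corpus/train/doc01.clauses,0.0134,0.25,yes
#
# i.e. stemmed phrase with its unstemmed form, source document, tfidf,
# first_occurrence and class.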
593
594
595# the number of keyphrases not covered by the arff file
596if ($debug) {print STDERR "Writing instances...\n";}
597$not_included = 0;
598$document_number = 1;
599
600foreach $document (@documents) {
601 if ($debug) {print STDERR " document $document_number: $document\n";}
602 $document_number++;
603
604 # load the keyphrases
605 undef %keyphrase;
606
607 $keyphrases = $document;
608 $keyphrases =~ s/$suffix$/key/;
609
610 if (-e $keyphrases) {
611 foreach (split(/\n/, `cat $keyphrases | $perl_command $gsdlhome/perllib/Kea-1.1.4/kea-tidy-key-file.pl | $gsdlhome/perllib/Kea-1.1.4/stemmer`)) {
612 $keyphrase{$_} = 1;
613 }
614 }
615
616 # Calculate document aggregates from the stemmed file
617
618 # total frequency in the document
619 undef %tf;
620 # the length of the document
621 $document_size = $document_size{$document};
622
623 # number of occurrences in the first 5% of the document
624 # undef %first_5;
625 # last occurrence
626 # undef %last;
627 # for standard deviation of distance between occurrences
628 # undef %interval_sd;
629 # undef %sumsquare;
630 # undef %sum;
631 # for entropy and Carl (disabled) measures
632 # undef %entropy;
633 # undef %f;
634 # undef %ff;
635 # $total_entropy{$document} = 0;
636 # $total_instances{$document} = 0;
637 # for calculating "offsets" sizes
638 # undef %offsets;
639 # undef %biggest_offsets;
640
641 open(S, "<$document.stemmed");
642 open(D, "<$document.distance");
643
644 while (<S>) {
645 chop($phrase = $_);
646 $tf{$phrase}++;
647
648 #@p = split(/ +/, $phrase);
649 #$left = join(' ', @p[0..$#p - 1]);
650 #$right = join(' ', @p[1..$#p]);
651 #$f{$left}{$phrase} ++;
652 #$f{$right}{$phrase} ++;
653 #$ff{$left} ++;
654 #$ff{$right} ++;
655
656 chop($distance = <D>);
657
658 # $first_5{$phrase} ++ if ($distance / $document_size < 0.05);
659
660 #if (!defined($sum{$phrase})) {
661 # $sum{$phrase} = 0;
662 # $sumsquare{$phrase} = 0;
663 #} else {
664 # $difference = $distance - $last{$phrase};
665 # $sum{$phrase} += $difference;
666 # $sumsquare{$phrase} += ($difference * $difference);
667 #}
668
669 # $last{$phrase} = $distance;
670
671 #if (!defined($biggest_offsets{$phrase})) {
672 # @{$offsets{$phrase}} = ($distance);
673 # $biggest_offsets{$phrase} = 1;
674 #} else {
675 # $cutoff = $distance - $offset_clumping_cutoff;
676 # #print "-- \n$phrase \noffsets: @{$offsets{$phrase}}\n";
677 # push( @{$offsets{$phrase}}, $distance );
678 # while (@{$offsets{$phrase}}[0] < $cutoff) {
679 # shift( @{$offsets{$phrase}} );
680 # }
681 # # print "offsets: @{$offsets{$phrase}}\n";
682 # if ($#{$offsets{$phrase}} >= $biggest_offsets{$phrase}) {
683 # $biggest_offsets{$phrase} = $#{$offsets{$phrase}} + 1;
684 # }
685 #}
686 }
687
688 close(S);
689 close(D);
690 # undef %offsets;
691
692
693 # Calculate complex attribute values
694
695 foreach $phrase (keys(%tf)) {
696
697 # Calculate standard deviation of distance between occurrences
698 #$interval_sd{$phrase} =
699 # sqrt( ($sumsquare{$phrase} -
700 # ($sum{$phrase} * $sum{$phrase} / $tf{$phrase}))
701 # / $tf{$phrase} );
702 #undef $sum{$phrase};
703 #undef $sumsquare{$phrase};
704
705
706 # Calculate Entropy
707 #$t = 0;
708 #$entropy = 0;
709 #$entropy = -1 if (!defined($f{$phrase}));
710 #
711 #foreach $superphrase (keys(%{$f{$phrase}})) {
712 # $t += $f{$phrase}{$superphrase};
713 #}
714 #
715 #foreach $superphrase (keys(%{$f{$phrase}})) {
716 # $p = $f{$phrase}{$superphrase} / $t;
717 # $entropy -= $p * log($p) / log(2);
718 #}
719 #
720 #$entropy{$phrase} = $entropy;
721
722 # Calculate Carl statistic
723 #$left = join(' ', @words[0..$#words - 1]);
724 #$right = join(' ', @words[1..$#words]);
725
726 #$carl1 = $f{$left}{$phrase} / $ff{$left};
727 #$carl2 = $f{$right}{$phrase} / $ff{$right};
728 #$carlmin = ($carl1 < $carl2) ? $carl1 : $carl2;
729 #if ($#words == 0) { $carlmin = 1; }
730
731
732 #undef $f{$phrase};
733 #undef $ff{$phrase};
734
735 }
736
737
738
739 # Write the arff file.
740 open(S, "<$document.stemmed");
741 open(U, "<$document.unstemmed");
742 open(D, "<$document.distance");
743
744 undef %seen;
745
746 while (<S>) {
747 chop($phrase = $_);
748 chop($unstemmed = <U>);
749 chop($distance = <D>);
750
751 # only output each phrase once
752 next if (defined($seen{$phrase}));
753 $seen{$phrase} = 1;
754
755 # ignore phrases of more than N words
756 @words = split(/ +/, $phrase);
757 next if ($#words >= $max_phrase_length);
758
759 # ignore likely proper-nouns
760 next if ($unstemmed =~ /^PN\-/);
761
762 # ignore singleton phrases
763 next if (($tf{$phrase} == 1) && $suppress_singleton_phrases);
764
765 # Calculate TFIDF
766 if (!defined($df{$phrase})) {
767 if ($testfile) { $df{$phrase} = 0 }
768 else { $df{$phrase} = 1; }
769 }
770 $tfidf = ($tf{$phrase} / $document_size) *
771 (log(($numdocs_in_global_corpus + $testfile) / ($df{$phrase} + $testfile)) / log(2));
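 # e.g. (figures illustrative) tf = 3 in a 1000-word document and
 # df = 4 in a 100-document corpus with $testfile == 1 gives
 # tfidf = (3/1000) * log2(101/5) ~ 0.003 * 4.34 ~ 0.013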
772
773 # Initialise number of words in first 5%
774 # $first_5{$phrase} = 0 if (!defined($first_5{$phrase}));
775
776 # Calculate class
777 $class = defined($keyphrase{$phrase}) ? "yes" : "no";
778
779 # For the Turney corpus, identify the journal
780 ($docnum) = $document =~ /(\d+)\.$suffix/;
781 if (!$Turney || !$docnum) {
782 $journal = "?";
783 } elsif ($docnum < 7) {
784 $journal = "JAIHR";
785 } elsif ($docnum < 27) {
786 $journal = "Psycoloquy";
787 } elsif ($docnum < 29) {
788 $journal = "Neuroscientist";
789 } elsif ($docnum < 43) {
790 $journal = "JCAMD";
791 } else {
792 $journal = "BBS";
793 }
794
795
796 # Write an instance to the arff file
797 print ARFF "'$phrase ($unstemmed)',$document,";
798 if ($Turney) { print ARFF "$journal,"; }
799 if ($keyword_frequency_file) {
800 if ($kf{$phrase}) {
801 if (defined($keyphrase{$phrase}) && $kf_covers_input) {
802 print ARFF $kf{$phrase} - 1, ",";
803 } else {
804 print ARFF "$kf{$phrase},";
805 }
806 } else {
807 print ARFF "0,";
808 }
809 }
810 printf(ARFF "%.3g,%.3g,%s\n",
811 $tfidf,
812 $distance / $document_size,
813 # $entropy{$phrase},
814 # $first_5{$phrase} / $tf{$phrase},
815 # $last{$phrase} / $document_size,
816 # $interval_sd{$phrase},
817 # $biggest_offsets{$phrase},
818 # $tf{$phrase} / $document_size,
819 # $df{$phrase} + $testfile,
820 $class
821 );
822
823 $keyphrase{$phrase} = 0 if (defined($keyphrase{$phrase}));
824
825 }
826
827 print ARFF "% Begin missing phrases for $document\n";
828 foreach $keyphrase (keys(%keyphrase)) {
829 next if ($keyphrase{$keyphrase} == 0);
830 $not_included ++;
831 print ARFF "'$keyphrase', $document, ";
832 if ($Turney) { print ARFF " ?, "; }
833 if ($keyword_frequency_file) { print ARFF " ?, "; }
834 print ARFF "?, ?, ?\n";
835 }
836 print ARFF "% Finish missing phrases for $document\n";
837
838 close(S);
839 close(U);
840 close(D);
841}
842
843# finish off arff file
844print ARFF "% $not_included key phrases not included\n";
845print "% $not_included key phrases not included\n";
846
847
848if ($debug) {print STDERR "k4.pl: $arfffile complete\n\n";}