#!/usr/bin/perl -w
#
# k4.pl
# Version 1.1
# Kea -- Automatic Keyphrase Extraction
# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
# Contact gwp@cs.waikato.ac.nz or eibe@cs.waikato.ac.nz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Version history
#
# 1.0     Ready to rebuild all the data for Witten et. al. 1998
# 1.0.1   bug: empty @attribute enumerations cause java crashes.
# 1.0.2   output number of documents found
# 1.0.3   bug: if -F and !defined($df{phrase}) then divide by zero crash.
# 1.0.4   Default attributes reduced to phrase, document, tfidf, first, class
# 1.0.5   Add keyphrase-frequency file option and attribute
# 1.0.6   keyphrase-frequency file corrects for covering input
# 1.0.7   use kea-tidy-key-file.pl | stemmer to clean up key files
# 1.0.8   -t toggle uses .tagged instead of .clauses files
# 1.0.9   Cleaned up output so it goes to STDERR not STDOUT
# 1.0.10  Use $perl_command for system-independence
# 1.0.11  List command line parameters if no arguments are given.
# 1.1     First Distribution. GPL added. Documentation improved.
# 1.1.1   Tweaked output a little; no changes to function

print STDERR "\nk4.pl: the arff file builder for Kea\n";

# Root of the Greenstone installation; used to locate the stemmer and
# helper scripts.
$gsdlhome = $ENV{'GSDLHOME'};

# Command line options
#   k4.pl [options] <directory> <arff file>
# options are:
#   -abciFs
#   -L <max phrase length>
#   -f <global frequency file>
#   -S <stopword file>
# NOTE(review): the bracketed argument names above were lost in an
# earlier text conversion and have been reconstructed -- confirm
# against the Kea 1.1 distribution.

require("getopts.pl");
&Getopts("abciL:f:FsS:kK:t");

# What files shall we use?
# Validate the two positional arguments (input directory and output
# arff file); print usage and die if either is missing.
if (!$ARGV[0] || !$ARGV[1]) {
    # FIX(review): the "<directory> <arff file>" placeholders were lost
    # in an earlier text conversion; reconstructed here.
    print STDERR "usage: k4.pl [options] <directory> <arff file>\n";
    print STDERR " options: -a stopword mode (default)\n";
    print STDERR " -b brill-phrase mode (requires -t)\n";
    print STDERR " -c carl-phrase mode (requires -t)\n";
    print STDERR " -f [global frequency file]\n";
    print STDERR " -F global frequency file includes input files\n";
    print STDERR " -i ignore stopwords in word counts\n";
    print STDERR " -K [keyword frequency file]\n";
    print STDERR " -k keyword frequency file includes input files\n";
    print STDERR " -L [maximum phrase length] in words (default = 3)\n";
    print STDERR " -S [stopword file]\n";
    print STDERR " -s keep singleton phrases\n";
    print STDERR " -t input files are *.tagged (default is *.clauses)\n";
    die "\n";
}

$directory = $ARGV[0];
$directory =~ s/\/$//;          # strip a trailing slash, if any
$arfffile = $ARGV[1];

print STDERR "Input directory: $directory\n";
print STDERR "Output Arff file: $arfffile\n";

# Set the maximum phrase length (in words).
# FIX: was /^\d$/, which only accepted single-digit values, so e.g.
# "-L 10" silently fell back to the default of 3.
if (($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
    $max_phrase_length = $opt_L;
} else {
    $max_phrase_length = 3;
}
print STDERR "Maximum phrase length: $max_phrase_length\n";

# Are we in Stopword mode, Brill mode, or Carl mode?
# Select the candidate-phrase mode: Brill (-b), Carl (-c) or the
# default stopword mode.  Exactly one of $brill_mode, $carl_mode,
# $stopword_mode ends up set.
#
# FIX: previously "-b -c" together printed the override warning but
# then fell through WITHOUT selecting any mode at all; now -b really
# does override -c as the warning promises.
if ($opt_b && $opt_c) {
    print STDERR "Warning: Brill phrase option overriding Carl phrase option.\n";
}

if ($opt_b) {
    # Brill phrases
    print STDERR "Brill phrase mode\n";
    $brill_mode = 1;
    # last word of a phrase must carry one of these tags
    $brill_n  = "/(NN|NNS|NNP|NNPS|VBG)\$";
    # non-final words may additionally be adjectives
    $brill_nj = "/(NN|NNS|NNP|NNPS|JJ)\$";

} elsif ($opt_c) {
    # Carl phrases
    print STDERR "Carl phrase mode\n";
    $carl_mode = 1;

    # regular expressions used to determine brillity
    $WORD = "[a-zA-Z0-9]+(\-?)[a-zA-Z0-9]+";
    $NOUN = "($WORD\/NN[A-Z]*)";
    $ADJ  = "($WORD\/JJ[A-Z]*)";
    $VBG  = "($WORD\ing\/VBG)";
    $VBN  = "($WORD\ed\/VBN)";
    $VB   = "($WORD\/VB)";
    $VBP  = "($WORD\/VBP)";
    $VERB = "$VBN|$VB|$VBG|$VBP";

    # The OED's conjunctions -- Carl's stuff
    $OED_CONJ = "((after|albeit|although|an|and|as|because|before|but|cos|directly|either|except|for|howbeit|if|immediately|instantly|lest|like|likewise|neither|nevertheless|nisi|nor|notwithstanding|now|only|or|otherwise|provided|providing|qua|since|so|supposing|than|that|tho|though|till|unless|until|when|whencesoever|whenever|whereas|whether|while|whilst|yet)\/CC)";

    # The OED's list of prepositions -- Carl's stuff
    # FIX: this list was space-separated (unlike $OED_CONJ), so the
    # alternation could never match a single "word/IN" token; converted
    # to a proper "|" alternation.
    $OED_PREP = "((abaft|aboard|about|above|across|afore|after|against|agin|along|alongside|amid|amidst|among|amongst|anent|around|aslant|astride|at|athwart|bar|barring|before|behind|below|beneath|beside|besides|between|betwixt|beyond|but|by|circa|concerning|considering|despite|down|during|ere|except|excepting|failing|for|from|in|inside|into|less|like|mid|midst|minus|near|neath|next|nigh|nigher|nighest|notwithstanding|oer|of|off|on|onto|outside|over|past|pending|per|plus|qua|re|respecting|round|sans|save|saving|since|thro|through|throughout|thru|till|to|touching|toward|towards|tween|twixt|under|underneath|unlike|until|unto|up|upon|versus|via|vice|wanting|with|within|without)\/IN)";

    # The OED based Carl-phrase wordlist
    $INCC = "$OED_CONJ|$OED_PREP";

} else {
    # stopword mode (default)
    $stopword_mode = 1;
    $stopword_file = "stopwords";
    if ($opt_S) {
        if (-e "$opt_S") {
            $stopword_file = $opt_S;
        } else {
            print STDERR "Can't find stopword file: $opt_S\n";
        }
    }
    print STDERR "Using stopword file: $stopword_file\n";

    # read the stopwords (one per line, lower-cased)
    # FIX: restored the <S> read that was lost in an earlier text
    # conversion ("while ()" read nothing).
    open(S, "<$stopword_file");
    while (<S>) {
        chomp;
        $stopword{lc($_)} = 1;
    }
    close(S);
}

# Should we ignore stopwords in word counts?
if ($opt_i) {
    print STDERR "Ignoring stopwords in word counts.\n";
    $use_all_words_in_word_count = 0;
} else {
    # default: count every word, stopwords included
    $use_all_words_in_word_count = 1;
}

# Which global-frequency file are we using?
if ($opt_f) {
    $document_frequency_file = $opt_f;
} else {
    $document_frequency_file = "document-frequencies";
}
print STDERR "Document frequency file: $document_frequency_file\n";

# How is the global-frequency file created?
# Note: $testfile is added to the df figure for each phrase.
# If the current document is not part of the document frequencies
# file then the df figure needs to be increased by 1 to allow
# for the fact that the phrase also occurs in the current file.
# Decide how document frequencies relate to the input files (-F), then
# process every input document into .stemmed/.unstemmed/.distance
# files, and finally load or build the document-frequency and
# keyword-frequency tables.
#
# FIXES in this section: all diamond filehandle reads (<F>, <D>, <KF>)
# had been stripped by an earlier text conversion, leaving "while ()"
# and "$x = ;"; they are restored here.

if ($opt_F && (-e "$document_frequency_file")) {
    print STDERR "Text files are covered by specified document frequency file\n";
    $testfile = 0;
} elsif ($opt_F) {
    die "Document frequency file doesn't exist, -F option impossible\n";
} elsif (-e "$document_frequency_file") {
    print STDERR "Text files are not covered by document frequency file\n";
    $testfile = 1;
} else {
    print STDERR "Document frequency file does not exist: create from training data\n";
    $testfile = 0;
}

# Which keyword-frequency file are we using?
if ($opt_K && (-e $opt_K)) {
    $keyword_frequency_file = $opt_K;
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    if ($opt_k) {
        print STDERR "Keyword frequency file covers input files\n";
        $kf_covers_input = 1;
    } else {
        print STDERR "Keyword frequency is independent of input\n";
        $kf_covers_input = 0;
    }
} elsif ($opt_K) {
    die "Keyword frequency file doesn't exist, -K option impossible\n";
} else {
    $keyword_frequency_file = "";
    print STDERR "No keyword frequency file\n";
}

# Should we show singleton phrases?
if ($opt_s) {
    $suppress_singleton_phrases = 0;
    print STDERR "Including singleton phrases.\n";
} else {
    $suppress_singleton_phrases = 1;
    print STDERR "Ignoring singleton phrases (default).\n";
}

# Do we look for *.tagged or *.clauses?
$suffix = "clauses";
if ($opt_t) {
    $suffix = "tagged";
}
print STDERR "Input file suffix: $suffix\n";

# How to run scripts etc
$perl_command = "perl -w";

# Are we using Turney's data?
$Turney = 0;

# Start keyphrase extraction: find the documents
@documents = split(/\s+/, `find $directory/ -name "*.$suffix"`);
if (!$documents[0]) {
    die "\nk4.pl error: no documents found.";
} else {
    print STDERR "\nProducing keyphrases for ", $#documents + 1, " documents\n";
}

print STDERR "Finding candidate phrases...\n";
$document_number = 1;

foreach $document (@documents) {
    print STDERR " document $document_number: $document\r";
    $document_number++;

    # if we've already produced this file, skip it.
    next if (-e "$document.stemmed");

    open(F, "<$document");
    open(S, "| $gsdlhome/perllib/Kea-1.1.4/stemmer > $document.stemmed");
    open(U, ">$document.unstemmed");
    open(D, ">$document.distance");

    $distance = 0;

    while (<F>) {
        chomp;

        # if we're in stopword mode, throw away the tags
        if ($stopword_mode) {
            s/\/\S+//g;
        }

        # work through each word of the line
        @words = split(/ +/, $_);
        foreach $i (0..$#words) {
            $finalword = $words[$i];

            # calculate distance from start (-i not set)
            if ($use_all_words_in_word_count) {
                $distance ++;
            }

            # make sure the last word in the phrase is okay
            if ($stopword_mode) {
                # the last word must not be a stopword
                next if $stopword{lc($finalword)};
            } elsif ($brill_mode) {
                # the last word must carry a noun/VBG tag
                next unless ($finalword =~ /$brill_n/);
            } elsif ($carl_mode) {
                # the last word must be either a noun or a vbg
                next unless (($finalword =~ /^$VBG$/) || ($finalword =~ /^$NOUN$/));
            }

            # calculate distance from start in -i mode
            # (this is still flawed for phrases containing stopwords)
            if (!$use_all_words_in_word_count) {
                $distance ++;
            }

            # print $finalword as a phrase of length 1
            if ($stopword_mode) {
                print U "$finalword\n";
                print S "$finalword\n";
            } else {
                $untag = $finalword;
                $untag =~ s/\/\S+$//;
                print U "$untag\n";
                print S "$untag\n";
            }
            print D $distance, "\n";

            # construct each longer phrase that ends on this word
            $phrase = $finalword;
            $counter = 1;
            foreach $j (1..$max_phrase_length) {
                # make sure we don't try to get a word from "before" the line
                last if ($j > $i);

                # find the first word of the next phrase
                $firstword = $words[$i - $j];
                $phrase = "$firstword $phrase";

                # make sure the phrase is valid
                if ($stopword_mode) {
                    # the first word may not be a stopword
                    next if $stopword{lc($firstword)};
                    $counter++;
                } elsif ($brill_mode) {
                    # all words other than the last must match $brill_nj
                    last unless ($firstword =~ /$brill_nj/);
                    $counter++;
                } elsif ($carl_mode) {
                    # the carl-reg-exp applies
                    if (($firstword =~ /^$VBG$/) || ($firstword =~ /^$NOUN$/)) {
                        $counter++;
                    }
                    next unless (isValidCarlPhrase());
                }

                # print the phrase
                if ($stopword_mode) {
                    print U "$phrase\n";
                    print S "$phrase\n";
                } else {
                    $untag = $phrase;
                    $untag =~ s/\/\S+//g;
                    print U "$untag\n";
                    print S "$untag\n";
                }

                # print the distance of the phrase's first word
                if ($use_all_words_in_word_count) {
                    print D ($distance - $j), "\n";
                } else {
                    print D ($distance - $counter), "\n";
                }
            }
        }
    }

    close(S);
    close(U);
    close(D);
    close(F);

    $document_size{$document} = $distance;

    # FIX: use $gsdlhome like the stemmer invocation above, instead of
    # the hard-coded /home/jmt14 path flagged "#WILL CHANGE THIS BACK".
    `$perl_command $gsdlhome/perllib/Kea-1.1.4/kea-choose-best-phrase.pl $document`;
}

# Make sure we know document sizes (for documents skipped above
# because their .stemmed file already existed).
foreach $document (@documents) {
    if (!defined($document_size{$document})) {
        $max = 0;
        open(D, "<$document.distance");
        while (<D>) {
            chomp;
            if ($_ > $max) {
                $max = $_;
            }
        }
        close(D);
        $document_size{$document} = $max;
        print STDERR "$document has size $document_size{$document}\n";
    }
}

# Calculate document frequencies
print STDERR "Gathering document frequencies...\n";

if (-e "$document_frequency_file") {
    print STDERR "Found document frequencies -- reading them!\n";
    open(F, "<$document_frequency_file");
    # first line holds the corpus size
    $numdocs_in_global_corpus = <F>;
    chomp($numdocs_in_global_corpus);
    print STDERR "$numdocs_in_global_corpus documents in frequency file.\n";
    # remaining lines: "<phrase words...> <df>"
    while (<F>) {
        @data = split(/ +/, $_);
        $phrase = join(' ', @data[0..$#data - 1]);
        $df{$phrase} = $data[$#data];
    }
    close(F);
} else {
    print STDERR "Document frequency file not found... creating\n";
    foreach $document (@documents) {
        undef %seen;
        foreach $phrase (split(/\n/, `cat $document.stemmed`)) {
            $df{$phrase} ++ if (!defined($seen{$phrase}));
            $seen{$phrase} = 1;
        }
    }
    $numdocs_in_global_corpus = $#documents + 1;

    print STDERR "Writing document frequencies to file...\n";
    open(F, ">$document_frequency_file");
    print F "$numdocs_in_global_corpus\n";
    foreach $phrase (keys %df) {
        @data = split(/ +/, $phrase);
        # note we keep phrases 1 longer than we need to just in
        # case we ever want to calculate entropy statistic
        if ($#data < $max_phrase_length) {
            print F "$phrase $df{$phrase}\n";
        }
    }
    close(F);
}

# Read the keyword frequency file
if ($keyword_frequency_file) {
    print STDERR "Reading keyword frequency file\n";
    open(KF, "<$keyword_frequency_file");
    $size_of_keyword_frequency_file = <KF>;
    chomp($size_of_keyword_frequency_file);
    print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n";
    while (<KF>) {
        @data = split(/\s+/, $_);
        $phrase = join(' ', @data[0..$#data - 1]);
        $kf{$phrase} = $data[$#data];
    }
    close(KF);
}

# What journals are we using?
# Write the ARFF file: header (@relation/@attribute lines), then one
# instance per unique candidate phrase per document, then any gold
# keyphrases that were never generated as candidates.
#
# FIXES in this section: the diamond reads <S>, <U>, <D> had been
# stripped by an earlier text conversion ("while ()", "chop($x = )");
# they are restored here.

# For Turney's corpus only: map document numbers to journals.
if ($Turney) {
    $journals{"unknown"} = 1;
    foreach $document (@documents) {
        ($docnum) = $document =~ /(\d+)\.$suffix/;
        if ($docnum < 7) {
            $journals{"JAIHR"} = 1;
        } elsif ($docnum < 27) {
            $journals{"Psycoloquy"} = 1;
        } elsif ($docnum < 29) {
            $journals{"Neuroscientist"} = 1;
        } elsif ($docnum < 43) {
            $journals{"JCAMD"} = 1;
        } else {
            $journals{"BBS"} = 1;
        }
    }
}

# Write the arff header
print STDERR "Writing ARFF file\n";
open (ARFF, ">$arfffile");
print ARFF "\@relation keyphrase\n\n";
print ARFF "\@attribute phrase string\n";
print ARFF "\@attribute document {", join(', ', @documents), "}\n";
if ($Turney) {
    print ARFF "\@attribute journal {", join(', ', keys(%journals)), "}\n";
}
if ($keyword_frequency_file) {
    print ARFF "\@attribute keyword_freq real\n";
}
print ARFF "\@attribute tfidf real\n";
print ARFF "\@attribute first_occurrence real\n";
# Disabled attributes (keep in sync with the disabled computations below):
# print ARFF "\@attribute entropy real\n";
# print ARFF "\@attribute in_first_5_percent real\n";
# print ARFF "\@attribute last real\n";
# print ARFF "\@attribute interval_deviation real\n";
# print ARFF "\@attribute max_cluster_of_$offset_clumping_cutoff real\n";
# print ARFF "\@attribute term_frequency real\n";
# print ARFF "\@attribute document_frequency real\n";
print ARFF "\@attribute class {yes,no}\n";
print ARFF "\n\@data\n";

# the number of keyphrases not covered by the arff file
print STDERR "Writing instances...\n";
$not_included = 0;
$document_number = 1;

foreach $document (@documents) {
    print STDERR " document $document_number: $document\r";
    $document_number++;

    # load the gold-standard keyphrases from the matching .key file
    undef %keyphrase;
    $keyphrases = $document;
    $keyphrases =~ s/$suffix$/key/;
    if (-e $keyphrases) {
        foreach (split(/\n/, `cat $keyphrases | $perl_command kea-tidy-key-file.pl | stemmer`)) {
            $keyphrase{$_} = 1;
        }
    }

    # Calculate document aggregates from the stemmed file

    # total frequency of each phrase in the document
    undef %tf;
    # the length of the document
    $document_size = $document_size{$document};

    # Disabled per-phrase statistics (see disabled @attribute lines):
    # undef %first_5;            # occurrences in the first 5% of the document
    # undef %last;               # last occurrence
    # undef %interval_sd; undef %sumsquare; undef %sum;
    # undef %entropy; undef %f; undef %ff;
    # $total_entropy{$document} = 0; $total_instances{$document} = 0;
    # undef %offsets; undef %biggest_offsets;

    open(S, "<$document.stemmed");
    open(D, "<$document.distance");
    while (<S>) {
        chop($phrase = $_);
        $tf{$phrase}++;
        chop($distance = <D>);
        # Disabled statistics once gathered here: first-5% counts,
        # last-occurrence, inter-occurrence sums/sumsquares, and
        # offset-cluster sizes -- dropped in 1.0.4, kept only as the
        # commented @attribute lines above.
    }
    close(S);
    close(D);

    # Calculate complex attribute values.
    # Currently a no-op: every computation in this loop (interval
    # deviation, entropy, Carl statistic) is disabled.
    foreach $phrase (keys(%tf)) {
        # $interval_sd{$phrase} = sqrt( ($sumsquare{$phrase} -
        #     ($sum{$phrase} * $sum{$phrase} / $tf{$phrase})) / $tf{$phrase} );
        # entropy over super-phrases of $phrase via %f / %ff
        # Carl statistic via $f{left}/$ff{left} and $f{right}/$ff{right}
    }

    # Write the arff instances for this document
    open(S, "<$document.stemmed");
    open(U, "<$document.unstemmed");
    open(D, "<$document.distance");
    undef %seen;

    while (<S>) {
        chop($phrase = $_);
        chop($unstemmed = <U>);
        chop($distance = <D>);

        # only output each phrase once
        next if (defined($seen{$phrase}));
        $seen{$phrase} = 1;

        # ignore phrases of more than N words
        @words = split(/ +/, $phrase);
        next if ($#words >= $max_phrase_length);

        # ignore likely proper-nouns
        next if ($unstemmed =~ /^PN\-/);

        # ignore singleton phrases
        next if (($tf{$phrase} == 1) && $suppress_singleton_phrases);

        # Calculate TFIDF; an unknown phrase counts 0 or 1 depending on
        # whether this file is covered by the df file (see $testfile).
        if (!defined($df{$phrase})) {
            if ($testfile) {
                $df{$phrase} = 0;
            } else {
                $df{$phrase} = 1;
            }
        }
        $tfidf = ($tf{$phrase} / $document_size) *
                 (log(($numdocs_in_global_corpus + $testfile) / ($df{$phrase} + $testfile)) / log(2));

        # Calculate class
        $class = defined($keyphrase{$phrase}) ? "yes" : "no";

        # For the Turney corpus, identify the journal
        ($docnum) = $document =~ /(\d+)\.$suffix/;
        if (!$Turney || !$docnum) {
            $journal = "?";
        } elsif ($docnum < 7) {
            $journal = "JAIHR";
        } elsif ($docnum < 27) {
            $journal = "Psycoloquy";
        } elsif ($docnum < 29) {
            $journal = "Neuroscientist";
        } elsif ($docnum < 43) {
            $journal = "JCAMD";
        } else {
            $journal = "BBS";
        }

        # Write an instance to the arff file
        print ARFF "'$phrase ($unstemmed)',$document,";
        if ($Turney) {
            print ARFF "$journal,";
        }
        if ($keyword_frequency_file) {
            if ($kf{$phrase}) {
                if (defined($keyphrase{$phrase}) && $kf_covers_input) {
                    # don't count this document's own keyphrase
                    print ARFF ($kf{$phrase} - 1), ",";
                } else {
                    print ARFF "$kf{$phrase},";
                }
            } else {
                print ARFF "0,";
            }
        }
        printf(ARFF "%.3g,%.3g,%s\n",
               $tfidf,
               $distance / $document_size,
               # disabled extras: entropy, first-5%, last, interval_sd,
               # biggest_offsets, tf, df -- re-enable together with
               # their @attribute lines above
               $class);

        # mark this gold keyphrase as covered
        $keyphrase{$phrase} = 0 if (defined($keyphrase{$phrase}));
    }

    # Gold keyphrases that never appeared as candidates get an
    # all-missing instance so they are at least recorded.
    print ARFF "% Begin missing phrases for $document\n";
    foreach $keyphrase (keys(%keyphrase)) {
        next if ($keyphrase{$keyphrase} == 0);
        $not_included ++;
        print ARFF "'$keyphrase', $document, ";
        if ($Turney) {
            print ARFF " ?, ";
        }
        if ($keyword_frequency_file) {
            print ARFF " ?, ";
        }
        print ARFF "?, ?, ?\n";
    }
    print ARFF "% Finish missing phrases for $document\n";

    close(S);
    close(U);
    close(D);
}

# finish off arff file
print ARFF "% $not_included key phrases not included\n";
print "% $not_included key phrases not included\n";
print STDERR "k4.pl: $arfffile complete\n\n";

# Is the current phrase a valid Carl phrase?
# The phrase is stored in $phrase.
# Is the current phrase (in the global $phrase) a valid Carl phrase?
#
# A valid Carl phrase is a sequence of tokens in which every non-final
# token is a noun, adjective or verb, each optionally followed by a
# single conjunction/preposition ($INCC), and the final token is a
# noun or a gerund (VBG).
#
# Reads the globals $phrase, $NOUN, $ADJ, $VERB, $VBG and $INCC that
# are set up in carl-phrase mode.  Returns 1 if valid, 0 otherwise.
sub isValidCarlPhrase () {
    my @toks = split(/ +/, $phrase);
    my $i = 0;

    while ($i < $#toks) {
        # every non-final token must be a noun, adjective or verb
        return 0 unless $toks[$i] =~ /^$NOUN$/
                     || $toks[$i] =~ /^$ADJ$/
                     || $toks[$i] =~ /^$VERB$/;
        $i++;

        # a single optional conjunction/preposition may follow
        if ($i <= $#toks && $toks[$i] =~ /^$INCC$/) {
            $i++;
        }
    }

    # If the optional INCC skip consumed the final token, there is no
    # room left for the required trailing noun/VBG.
    return 0 if ($i > $#toks);

    # the last word must be either a noun or a vbg
    return ($toks[$i] =~ /^$VBG$/ || $toks[$i] =~ /^$NOUN$/) ? 1 : 0;
}