source: trunk/gsdl/perllib/Kea-1.1.4/k4.pl@ 9156

Last change on this file since 9156 was 4281, checked in by jrm21, 21 years ago

1) use Getopt::Std instead of getopts.pl
2) Only print informative messages if debug is set
Kea only:
3) only compile stemmer if it isn't already there.
4) remove temp files unless debug is set
k4.pl only:
5) need to move isValidCarlPhrase() before first call

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.0 KB
1#!/usr/bin/perl -w
2
3# k4.pl
4# Version 1.1
5
6# Kea -- Automatic Keyphrase Extraction
7# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8# Contact [email protected] or [email protected]
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24# Version history
25#
26# 1.0 Ready to rebuild all the data for Witten et al. 1998
27# 1.0.1 bug: empty @attribute enumerations cause java crashes.
28# 1.0.2 output number of documents found
29# 1.0.3 bug: if -F and !defined($df{phrase}) then divide by zero crash.
30# 1.0.4 Default attributes reduced to phrase, document, tfidf, first, class
31# 1.0.5 Add keyphrase-frequency file option and attribute
32# 1.0.6 keyphrase-frequency file corrects for covering input
33# 1.0.7 use kea-tidy-key-file.pl | stemmer to clean up key files
34# 1.0.8 -t toggle uses .tagged instead of .clauses files
35# 1.0.9 Cleaned up output so it goes to STDERR not STDOUT
36# 1.0.10 Use $perl_command for system-independence
37# 1.0.11 List command line parameters if no arguments are given.
38# 1.1 First Distribution. GPL added. Documentation improved.
39# 1.1.1 Tweaked output a little; no changes to function
40
41print STDERR "k4.pl: the arff file builder for Kea\n";
42
43$gsdlhome = $ENV{'GSDLHOME'};
44
45# Command line options
46# k4.pl [options] <input-directory> <output-arff-file>
47# options are:
48# -abciFskt
49# -L <max phrase length>
50# -f <df file>, -K <keyword frequency file>
51# -S <stopword file>
52
53use Getopt::Std;
54$opt_i = 0;
55$opt_t = 0;
56getopts("abciL:f:FsS:kK:t");
57my $debug = 0;
58
59# What files shall we use?
60if (!$ARGV[0] || !$ARGV[1]) {
61 print STDERR "usage: k4.pl [options] <input-directory> <output-arff-file>\n";
62 print STDERR " options: -a stopword mode (default)\n";
63 print STDERR " -b brill-phrase mode (requires -t)\n";
64 print STDERR " -c carl-phrase mode (requires -t)\n";
65 print STDERR " -f [global frequency file]\n";
66 print STDERR " -F global frequency file includes input files\n";
67 print STDERR " -i ignore stopwords in word counts\n";
68 print STDERR " -K [keyword frequency file]\n";
69 print STDERR " -k keyword frequency file includes input files\n";
70 print STDERR " -L [maximum phrase length] in words (default = 3)\n";
71 print STDERR " -S [stopword file]\n";
72 print STDERR " -s keep singleton phrases\n";
73 print STDERR " -t input files are *.tagged (default is *.clauses)\n";
74 die "\n";
75}
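# For example (a hypothetical run; directory and file names are
# illustrative only), building an ARFF file from a directory of
# *.clauses files with the default settings might look like:
#
#   perl k4.pl -L 3 -S stopwords -f document-frequencies corpus/train train.arff
#
# which reads corpus/train/*.clauses and writes the candidate-phrase
# instances to train.arff.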
76my $directory = $ARGV[0];
77$directory =~ s/\/$//;
78$arfffile = $ARGV[1];
79
80if ($debug) {
81 print STDERR "Input directory: $directory\n";
82 print STDERR "Output Arff file: $arfffile\n";
83}
84
85
86
87
88# Is the current phrase a valid Carl phrase?
89# The phrase is stored in $phrase.
90my ($NOUN, $ADJ, $VERB, $INCC, $VBG); # globals, set below
91
92sub isValidCarlPhrase () {
93 @wds = split(/ +/, $phrase);
94 $index = 0;
95 $wd = $wds[$index];
96
97 while ($index < $#wds) {
98 # current word must be a noun, adj, or verb
99 if (($wd =~ /^$NOUN$/) || ($wd =~ /^$ADJ$/) || ($wd =~ /^$VERB$/)) {
100 $index++;
101 $wd = $wds[$index];
102 } else {
103 return 0;
104 }
105
106 # next is an optional incc
107 if ($wd =~ /^$INCC$/) {
108 # it is an incc, so advance one word
109 $index++;
110 $wd = $wds[$index];
111 }
112 }
113
114 # since we can advance two words per loop iteration, we may have
115 # run out of input. If so, the phrase is not a valid Carl phrase,
116 # as there is no room for a final noun or VBG
117 if ($index > $#wds) {
118 return 0;
119 }
120
121 # the last word must be either a noun or a vbg
122 if (($wd =~ /^$VBG$/) || ($wd =~ /^$NOUN$/)) {
123 return 1;
124 } else {
125 return 0;
126 }
127}
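# A worked example of the check above (hypothetical tagged input): the
# phrase "empirical/JJ analysis/NN of/IN keyphrases/NNS" is accepted,
# because it walks ADJ, NOUN, INCC and ends on a NOUN, whereas
# "of/IN keyphrases/NNS" is rejected because its first word is neither
# a noun, an adjective nor a verb.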
128
129
130
131
132
133
134# Set the maximum phrase length
135if (($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
136 $max_phrase_length = $opt_L;
137} else {
138 $max_phrase_length = 3;
139}
140
141if ($debug) {
142 print STDERR "Maximum phrase length: $max_phrase_length\n";
143}
144
145# Are we in Stopword mode, Brill mode, or Carl mode?
146if ($opt_b && $opt_c) {
147 print STDERR "Warning: Brill phrase option overriding Carl phrase option.\n";
148 $opt_c = 0;
149}
150if ($opt_b) { # Brill phrases
151 if ($debug) {print STDERR "Brill phrase mode\n";}
152 $brill_mode = 1;
153
154 $brill_n = "/(NN|NNS|NNP|NNPS|VBG)\$";
155 $brill_nj = "/(NN|NNS|NNP|NNPS|JJ)\$";
156
157} elsif ($opt_c) {
158 # Carl phrases
159 if ($debug) {print STDERR "Carl phrase mode\n";}
160 $carl_mode = 1;
161
162 # regular expressions used to determine brillity
163 $WORD = "[a-zA-Z0-9]+(\-?)[a-zA-Z0-9]+";
164 $NOUN = "($WORD\/NN[A-Z]*)";
165 $ADJ = "($WORD\/JJ[A-Z]*)";
166 $VBG = "(${WORD}ing\/VBG)";
167 $VBN = "(${WORD}ed\/VBN)";
168
169 $VB = "($WORD\/VB)";
170 $VBP = "($WORD\/VBP)";
171 $VERB = "($VBN|$VB|$VBG|$VBP)";
172
173 # The OED's conjunctions -- Carl's stuff
174 $OED_CONJ = "((after|albeit|although|an|and|as|because|before|but|cos|directly|either|except|for|howbeit|if|immediately|instantly|lest|like|likewise|neither|nevertheless|nisi|nor|notwithstanding|now|only|or|otherwise|provided|providing|qua|since|so|supposing|than|that|tho|though|till|unless|until|when|whencesoever|whenever|whereas|whether|while|whilst|yet)\/CC)";
175
176 # The OED's list of prepositions -- Carl's stuff
177 $OED_PREP = "((abaft|aboard|about|above|across|afore|after|against|agin|along|alongside|amid|amidst|among|amongst|anent|around|aslant|astride|at|athwart|bar|barring|before|behind|below|beneath|beside|besides|between|betwixt|beyond|but|by|circa|concerning|considering|despite|down|during|ere|except|excepting|failing|for|from|in|inside|into|less|like|mid|midst|minus|near|neath|next|nigh|nigher|nighest|notwithstanding|oer|of|off|on|onto|outside|over|past|pending|per|plus|qua|re|respecting|round|sans|save|saving|since|thro|through|throughout|thru|till|to|touching|toward|towards|tween|twixt|under|underneath|unlike|until|unto|up|upon|versus|via|vice|wanting|with|within|without)\/IN)";
178
179 # The OED based Carl-phrase wordlist
180 $INCC = "($OED_CONJ|$OED_PREP)";
181
182
183
184} else {
185 # stopword mode
186 #print STDERR "Stopword mode\n";
187 $stopword_mode = 1;
188
189 $stopword_file = "stopwords";
190 if ($opt_S) {
191 if (-e "$opt_S") {
192 $stopword_file = $opt_S;
193 } else {
194 print STDERR "Can't find stopword file: $opt_S\n";
195 }
196 }
197 if ($debug) {
198 print STDERR "Using stopword file: $stopword_file\n";
199 }
200
201 # read the stopwords
202 open(S, "<$stopword_file");
203 while (<S>) {
204 chomp;
205 $stopword{lc($_)} = 1;
206 }
207 close(S);
208}
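# The stopword file is plain text, one entry per line; each entry is
# lower-cased on loading and candidate words are compared
# case-insensitively. A minimal file might contain (contents
# illustrative):
#
#   the
#   of
#   and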
209
210
211# Should we ignore stopwords in word counts?
212if ($opt_i) {
213 print STDERR "Ignoring stopwords in word counts.\n";
214 $use_all_words_in_word_count = 0;
215} else {
216 #print STDERR "Using all words (including stopwords) in word count (default).\n";
217 $use_all_words_in_word_count = 1;
218}
219
220
221# Which global-frequency file are we using
222if ($opt_f) {
223 $document_frequency_file = $opt_f;
224} else {
225 $document_frequency_file = "document-frequencies";
226}
227
228if ($debug) {
229 print STDERR "Document frequency file: $document_frequency_file\n";
230}
231
232
233# How is the global-frequency file created?
234# Note: $testfile is added to the df figure for each phrase.
235# If the current document is not part of the document frequencies
236# file then the df figure needs to be increased by 1 to allow
237# for the fact that the phrase also occurs in the current file.
238
239if ($opt_F && (-e "$document_frequency_file")) {
240 if ($debug) {
241 print STDERR "Text files are covered by specified document "
242 . "frequency file\n";
243 }
244 $testfile = 0;
245} elsif ($opt_F) {
246 die "Document frequency file doesn't exist, -F option impossible\n";
247} elsif (-e "$document_frequency_file") {
248 if ($debug) {
249 print STDERR "Text files are not covered by document frequency file\n";
250 }
251 $testfile = 1;
252} else {
253 print STDERR "Document frequency file does not exist: create from training data\n";
254 $testfile = 0;
255}
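# Worked example of the correction (figures illustrative): if a phrase
# has a df of 3 in a frequency file built from a separate corpus of 100
# documents, then $testfile is 1 and the TFIDF computation below uses an
# effective df of 3 + 1 = 4 and a corpus size of 100 + 1 = 101,
# crediting the phrase's occurrence in the current document.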
256
257# Which keyword-frequency file are we using
258if ($opt_K && (-e $opt_K)) {
259 $keyword_frequency_file = $opt_K;
260 if ($debug) {
261 print STDERR "Keyword frequency file: $keyword_frequency_file\n";
262 }
263 if ($opt_k && $opt_k) {
264 if ($debug) {
265 print STDERR "Keyword frequency file covers input files\n";
266 }
267 $kf_covers_input = 1;
268 } else {
269 if ($debug) {
270 print STDERR "Keyword frequency is independent of input\n";
271 }
272 $kf_covers_input = 0;
273 }
274} elsif ($opt_K) {
275 die "Keyword frequency file doesn't exist, -K option impossible\n";
276} else {
277 $keyword_frequency_file = "";
278 if ($debug) {print STDERR "No keyword frequency file\n";}
279}
280
281
282# Should we show singleton phrases?
283if ($opt_s && $opt_s) {
284 $suppress_singleton_phrases = 0;
285 print STDERR "Including singleton phrases.\n"
286} else {
287 $suppress_singleton_phrases = 1;
288 if ($debug) {print STDERR "Ignoring singleton phrases (default).\n"}
289}
290
291
292# Do we look for *.tagged or *.clauses?
293$suffix = "clauses";
294if ($opt_t) {
295 $suffix = "tagged";
296}
297
298if ($debug) {
299 print STDERR "Input file suffix: $suffix\n";
300}
301
302# How to run scripts etc
303my $perl_command = "perl";
304my @perl_args = ("-w"); # interpreter switches passed to system() below
305
306# Are we using Turney's data
307$Turney = 0;
308
309
310
311
312# Start keyphrase extraction
313
314# find the documents
315@documents = split(/\s+/, `find $directory/ -name "*.$suffix"`);
316if (!$documents[0]) {
317 die "\nk4.pl error: no documents found.";
318} else {
319 print STDERR "Producing keyphrases for ", $#documents + 1, " documents\n";
320}
321
322if ($debug) {print STDERR "Finding candidate phrases...\n";}
323$document_number = 1;
324
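# Each document produces three parallel working files, one candidate
# phrase per line: $document.stemmed (the phrase run through the
# stemmer), $document.unstemmed (the phrase with part-of-speech tags
# stripped) and $document.distance (the word offset of the phrase's
# first word from the start of the document). These are re-read below
# when the ARFF instances are written.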
325foreach $document (@documents) {
326 if ($debug) {print STDERR " document $document_number: $document\n";}
327 $document_number++;
328
329 # if we've already produced this file, skip it.
330 next if (-e "$document.stemmed");
331
332 open(F, "<$document");
333
334 #open(S, "| ./stemmer > $document.stemmed");
335 open(S, "| $gsdlhome/perllib/Kea-1.1.4/stemmer > $document.stemmed");
336 open(U, ">$document.unstemmed");
337 open(D, ">$document.distance");
338
339 $distance = 0;
340
341 while (<F>) {
342 chomp;
343
344 # if we're in stopword mode, throw away the tags
345 if ($stopword_mode) {
346 s/\/\S+//g;
347 }
348
349 #work through each line
350 @words = split(/ +/, $_);
351
352 foreach $i (0..$#words) {
353 $finalword = $words[$i];
354
355 # calculate distance from start (-i not set)
356 if ($use_all_words_in_word_count) {
357 $distance ++;
358 }
359
360 # make sure the last word in the phrase is okay
361 if ($stopword_mode) {
362 # the last word must not be a stopword
363 next if $stopword{lc($finalword)};
364 } elsif ($brill_mode) {
365 # the last word must match $brill_n
366 next unless ($finalword =~ /$brill_n/);
367 } elsif ($carl_mode) {
368 # the last word must be either a noun or a vbg
369 next unless (($finalword =~ /^$VBG$/) ||
370 ($finalword =~ /^$NOUN$/));
371 }
372
373 # calculate distance from start in -i mode
374 # (this is still flawed for phrases containing stopwords)
375 if (!$use_all_words_in_word_count) {
376 $distance ++;
377 }
378
379 # print $finalword as a phrase of length 1
380 if ($stopword_mode) {
381 print U "$finalword\n";
382 print S "$finalword\n";
383 } else {
384 $untag = $finalword;
385 $untag =~ s/\/\S+$//;
386 print U "$untag\n";
387 print S "$untag\n";
388 }
389 print D $distance, "\n";
390
391 # construct each longer phrase that ends on this word.
392 $phrase = $finalword;
393 $counter = 1;
394
395 foreach $j (1..$max_phrase_length) {
396
397 # make sure we don't try to get a word from "before" the line
398 last if ($j > $i);
399
400 # find the first word of the next phrase
401 $firstword = $words[$i - $j];
402 $phrase = "$firstword $phrase";
403
404 # make sure the phrase is valid
405 if ($stopword_mode) {
406 # the first word may not be a stopword
407 next if $stopword{lc($firstword)};
408 $counter++;
409 } elsif ($brill_mode) {
410 # all words other than the last must match $brill_nj
411 last unless ($firstword =~ /$brill_nj/);
412 $counter++;
413 } elsif ($carl_mode) {
414 # count the first word if it is a noun or VBG, then require the whole phrase to be a valid Carl phrase
415 if (($firstword =~ /^$VBG$/) || ($firstword =~ /^$NOUN$/)) {
416 $counter++;
417 }
418 next unless (isValidCarlPhrase());
419 }
420
421 # print the phrase
422 if ($stopword_mode) {
423 print U "$phrase\n";
424 print S "$phrase\n";
425 } else {
426 $untag = $phrase;
427 $untag =~ s/\/\S+//g;
428 print U "$untag\n";
429 print S "$untag\n";
430 }
431
432 # print the distance
433 if ($use_all_words_in_word_count) {
434 print D $distance - $j, "\n";
435 } else {
436 print D $distance - $counter, "\n";
437 }
438 }
439
440 }
441 }
442
443 close(S);
444 close(U);
445 close(D);
446 close(F);
447
448 $document_size{$document} = $distance; #WILL CHANGE THIS BACK
449
450 system($perl_command, @perl_args, "$gsdlhome/perllib/Kea-1.1.4/kea-choose-best-phrase.pl", $document);
451}
452
453#print STDERR "\nMake sure we know document sizes...\n";
454
455foreach $document (@documents) {
456 if (!defined($document_size{$document})) {
457 $max = 0;
458 open(D, "<$document.distance");
459 while (<D>) {
460 chomp;
461 if ($_ > $max) { $max = $_; }
462 }
463 close(D);
464 $document_size{$document} = $max;
465 print STDERR "$document has size $document_size{$document}\n";
466 }
467}
468
469
470
471# Calculate document frequencies
472
473if ($debug) {print STDERR "Gathering document frequencies...\n";}
474
475if (-e "$document_frequency_file") {
476 if ($debug) {
477 print STDERR "Found document frequencies -- reading them!\n";
478 }
479 open(F, "<$document_frequency_file");
480
481 $numdocs_in_global_corpus = <F>;
482 chomp($numdocs_in_global_corpus);
483 print STDERR "$numdocs_in_global_corpus documents in frequency file.\n";
484
485 while (<F>) {
486 @data = split(/ +/, $_);
487 $phrase = join(' ', @data[0..$#data - 1]);
488 $df{$phrase} = $data[$#data];
489 }
490 close(F);
491
492} else {
493
494 print STDERR "Document frequency file not found... creating\n";
495
496 foreach $document (@documents) {
497 undef %seen;
498 foreach $phrase (split(/\n/, `cat $document.stemmed`)) {
499 $df{$phrase} ++ if (!defined($seen{$phrase}));
500 $seen{$phrase} = 1;
501 }
502 }
503 $numdocs_in_global_corpus = $#documents + 1;
504
505 if ($debug) {print STDERR "Writing document frequencies to file...\n";}
506 open(F, ">$document_frequency_file");
507 print F "$numdocs_in_global_corpus\n";
508 foreach $phrase (keys %df) {
509 @data = split(/ +/, $phrase);
510 # note: we keep phrases one word longer than we need, just in
511 # case we ever want to calculate the entropy statistic
512 if ($#data < $max_phrase_length) {
513 print F "$phrase $df{$phrase}\n";
514 }
515 }
516 close(F);
517}
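# Either way, the document-frequencies file has the form (figures and
# stems illustrative):
#
#   120
#   keyphras extract 17
#   neural network 42
#
# i.e. the number of documents in the corpus on the first line, then one
# stemmed phrase and its document frequency per line.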
518
519
520# Read the keyword frequency file
521if ($keyword_frequency_file) {
522 if ($debug) {print STDERR "Reading keyword frequency file\n";}
523
524 open(KF, "<$keyword_frequency_file");
525
526 $size_of_keyword_frequency_file = <KF>;
527 chomp($size_of_keyword_frequency_file);
528 if ($debug) {
529 print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n";
530 }
531
532 while (<KF>) {
533 @data = split(/\s+/, $_);
534 $phrase = join(' ', @data[0..$#data - 1]);
535 $kf{$phrase} = $data[$#data];
536 }
537 close(KF);
538}
539
540
541
542# What journals are we using?
543if ($Turney) {
544 $journals{"unknown"} = 1;
545 foreach $document (@documents) {
546 ($docnum) = $document =~ /(\d+)\.$suffix/;
547 if ($docnum < 7) {
548 $journals{"JAIHR"} = 1;
549 } elsif ($docnum < 27) {
550 $journals{"Psycoloquy"} = 1;
551 } elsif ($docnum < 29) {
552 $journals{"Neuroscientist"} = 1;
553 } elsif ($docnum < 43) {
554 $journals{"JCAMD"} = 1;
555 } else {
556 $journals{"BBS"} = 1;
557 }
558 }
559}
560
561
562# Write the arff files
563
564if ($debug) {print STDERR "Writing ARFF file\n";}
565
566open (ARFF, ">$arfffile");
567
568print ARFF "\@relation keyphrase\n\n";
569print ARFF "\@attribute phrase string\n";
570print ARFF "\@attribute document {", join(', ', @documents), "}\n";
571
572if ($Turney) {
573 print ARFF "\@attribute journal {", join(', ', keys(%journals)), "}\n";
574}
575if ($keyword_frequency_file) {
576 print ARFF "\@attribute keyword_freq real\n";
577}
578
579print ARFF "\@attribute tfidf real\n";
580print ARFF "\@attribute first_occurrence real\n";
581
582
583# print ARFF "\@attribute entropy real\n";
584# print ARFF "\@attribute in_first_5_percent real\n";
585# print ARFF "\@attribute last real\n";
586# print ARFF "\@attribute interval_deviation real\n";
587# print ARFF "\@attribute max_cluster_of_$offset_clumping_cutoff real\n";
588# print ARFF "\@attribute term_frequency real\n";
589# print ARFF "\@attribute document_frequency real\n";
590print ARFF "\@attribute class {yes,no}\n";
591
592print ARFF "\n\@data\n";
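# With the default attribute set (no keyword frequency file and $Turney
# off), each instance written below has the form (values illustrative):
#
#   'keyphras extract (keyphrase extraction)',corpus/train/doc01.clauses,0.0134,0.25,yes
#
# i.e. stemmed phrase with its unstemmed form, source document, tfidf,
# first_occurrence and class.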
593
594
595# the number of keyphrases not covered by the arff file
596if ($debug) {print STDERR "Writing instances...\n";}
597$not_included = 0;
598$document_number = 1;
599
600foreach $document (@documents) {
601 if ($debug) {print STDERR " document $document_number: $document\n";}
602 $document_number++;
603
604 # load the keyphrases
605 undef %keyphrase;
606
607 $keyphrases = $document;
608 $keyphrases =~ s/$suffix$/key/;
609
610 if (-e $keyphrases) {
611 foreach (split(/\n/, `cat $keyphrases | $perl_command $gsdlhome/perllib/Kea-1.1.4/kea-tidy-key-file.pl | $gsdlhome/perllib/Kea-1.1.4/stemmer`)) {
612 $keyphrase{$_} = 1;
613 }
614 }
615
616 # Calculate document aggregates from the stemmed file
617
618 # total frequency in the document
619 undef %tf;
620 # the length of the document
621 $document_size = $document_size{$document};
622
623 # number of occurrences in the first 5% of the document
624 # undef %first_5;
625 # last occurrence
626 # undef %last;
627 # for standard deviation of distance between occurrences
628 # undef %interval_sd;
629 # undef %sumsquare;
630 # undef %sum;
631 # for entropy and Carl (disabled) measures
632 # undef %entropy;
633 # undef %f;
634 # undef %ff;
635 # $total_entropy{$document} = 0;
636 # $total_instances{$document} = 0;
637 # for calculating "offsets" sizes
638 # undef %offsets;
639 # undef %biggest_offsets;
640
641 open(S, "<$document.stemmed");
642 open(D, "<$document.distance");
643
644 while (<S>) {
645 chop($phrase = $_);
646 $tf{$phrase}++;
647
648 #@p = split(/ +/, $phrase);
649 #$left = join(' ', @p[0..$#p - 1]);
650 #$right = join(' ', @p[1..$#p]);
651 #$f{$left}{$phrase} ++;
652 #$f{$right}{$phrase} ++;
653 #$ff{$left} ++;
654 #$ff{$right} ++;
655
656 chop($distance = <D>);
657
658 # $first_5{$phrase} ++ if ($distance / $document_size < 0.05);
659
660 #if (!defined($sum{$phrase})) {
661 # $sum{$phrase} = 0;
662 # $sumsquare{$phrase} = 0;
663 #} else {
664 # $difference = $distance - $last{$phrase};
665 # $sum{$phrase} += $difference;
666 # $sumsquare{$phrase} += ($difference * $difference);
667 #}
668
669 # $last{$phrase} = $distance;
670
671 #if (!defined($biggest_offsets{$phrase})) {
672 # @{$offsets{$phrase}} = ($distance);
673 # $biggest_offsets{$phrase} = 1;
674 #} else {
675 # $cutoff = $distance - $offset_clumping_cutoff;
676 # #print "-- \n$phrase \noffsets: @{$offsets{$phrase}}\n";
677 # push( @{$offsets{$phrase}}, $distance );
678 # while (@{$offsets{$phrase}}[0] < $cutoff) {
679 # shift( @{$offsets{$phrase}} );
680 # }
681 # # print "offsets: @{$offsets{$phrase}}\n";
682 # if ($#{$offsets{$phrase}} >= $biggest_offsets{$phrase}) {
683 # $biggest_offsets{$phrase} = $#{$offsets{$phrase}} + 1;
684 # }
685 #}
686 }
687
688 close(S);
689 close(D);
690 # undef %offsets;
691
692
693 # Calculate complex attribute values
694
695 foreach $phrase (keys(%tf)) {
696
697 # Calculate standard deviation of distance between occurrences
698 #$interval_sd{$phrase} =
699 # sqrt( ($sumsquare{$phrase} -
700 # ($sum{$phrase} * $sum{$phrase} / $tf{$phrase}))
701 # / $tf{$phrase} );
702 #undef $sum{$phrase};
703 #undef $sumsquare{$phrase};
704
705
706 # Calculate Entropy
707 #$t = 0;
708 #$entropy = 0;
709 #$entropy = -1 if (!defined($f{$phrase}));
710 #
711 #foreach $superphrase (keys(%{$f{$phrase}})) {
712 # $t += $f{$phrase}{$superphrase};
713 #}
714 #
715 #foreach $superphrase (keys(%{$f{$phrase}})) {
716 # $p = $f{$phrase}{$superphrase} / $t;
717 # $entropy -= $p * log($p) / log(2);
718 #}
719 #
720 #$entropy{$phrase} = $entropy;
721
722 # Calculate Carl statistic
723 #$left = join(' ', @words[0..$#words - 1]);
724 #$right = join(' ', @words[1..$#words]);
725
726 #$carl1 = $f{$left}{$phrase} / $ff{$left};
727 #$carl2 = $f{$right}{$phrase} / $ff{$right};
728 #$carlmin = ($carl1 < $carl2) ? $carl1 : $carl2;
729 #if ($#words == 0) { $carlmin = 1; }
730
731
732 #undef $f{$phrase};
733 #undef $ff{$phrase};
734
735 }
736
737
738
739 # Write the arff file.
740 open(S, "<$document.stemmed");
741 open(U, "<$document.unstemmed");
742 open(D, "<$document.distance");
743
744 undef %seen;
745
746 while (<S>) {
747 chop($phrase = $_);
748 chop($unstemmed = <U>);
749 chop($distance = <D>);
750
751 # only output each phrase once
752 next if (defined($seen{$phrase}));
753 $seen{$phrase} = 1;
754
755 # ignore phrases of more than N words
756 @words = split(/ +/, $phrase);
757 next if ($#words >= $max_phrase_length);
758
759 # ignore likely proper-nouns
760 next if ($unstemmed =~ /^PN\-/);
761
762 # ignore singleton phrases
763 next if (($tf{$phrase} == 1) && $suppress_singleton_phrases);
764
765 # Calculate TFIDF
766 if (!defined($df{$phrase})) {
767 if ($testfile) { $df{$phrase} = 0 }
768 else { $df{$phrase} = 1; }
769 }
770 $tfidf = ($tf{$phrase} / $document_size) *
771 (log(($numdocs_in_global_corpus + $testfile) / ($df{$phrase} + $testfile)) / log(2));
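 # e.g. (figures illustrative) tf = 3 in a 1000-word document and
 # df = 4 in a 100-document corpus with $testfile == 1 gives
 # tfidf = (3/1000) * log2(101/5) ~ 0.003 * 4.34 ~ 0.013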
772
773 # Initialise number of words in first 5%
774 # $first_5{$phrase} = 0 if (!defined($first_5{$phrase}));
775
776 # Calculate class
777 $class = defined($keyphrase{$phrase}) ? "yes" : "no";
778
779 # For the Turney corpus, identify the journal
780 ($docnum) = $document =~ /(\d+)\.$suffix/;
781 if (!$Turney || !$docnum) {
782 $journal = "?";
783 } elsif ($docnum < 7) {
784 $journal = "JAIHR";
785 } elsif ($docnum < 27) {
786 $journal = "Psycoloquy";
787 } elsif ($docnum < 29) {
788 $journal = "Neuroscientist";
789 } elsif ($docnum < 43) {
790 $journal = "JCAMD";
791 } else {
792 $journal = "BBS";
793 }
794
795
796 # Write an instance to the arff file
797 print ARFF "'$phrase ($unstemmed)',$document,";
798 if ($Turney) { print ARFF "$journal,"; }
799 if ($keyword_frequency_file) {
800 if ($kf{$phrase}) {
801 if (defined($keyphrase{$phrase}) && $kf_covers_input) {
802 print ARFF $kf{$phrase} - 1, ",";
803 } else {
804 print ARFF "$kf{$phrase},";
805 }
806 } else {
807 print ARFF "0,";
808 }
809 }
810 printf(ARFF "%.3g,%.3g,%s\n",
811 $tfidf,
812 $distance / $document_size,
813 # $entropy{$phrase},
814 # $first_5{$phrase} / $tf{$phrase},
815 # $last{$phrase} / $document_size,
816 # $interval_sd{$phrase},
817 # $biggest_offsets{$phrase},
818 # $tf{$phrase} / $document_size,
819 # $df{$phrase} + $testfile,
820 $class
821 );
822
823 $keyphrase{$phrase} = 0 if (defined($keyphrase{$phrase}));
824
825 }
826
827 print ARFF "% Begin missing phrases for $document\n";
828 foreach $keyphrase (keys(%keyphrase)) {
829 next if ($keyphrase{$keyphrase} == 0);
830 $not_included ++;
831 print ARFF "'$keyphrase', $document, ";
832 if ($Turney) { print ARFF " ?, "; }
833 if ($keyword_frequency_file) { print ARFF " ?, "; }
834 print ARFF "?, ?, ?\n";
835 }
836 print ARFF "% Finish missing phrases for $document\n";
837
838 close(S);
839 close(U);
840 close(D);
841}
842
843# finish off arff file
844print ARFF "% $not_included key phrases not included\n";
845print "% $not_included key phrases not included\n";
846
847
848if ($debug) {print STDERR "k4.pl: $arfffile complete\n\n";}