source: trunk/gsdl/perllib/Kea-1.1.4/k4.pl@ 1972

Last change on this file since 1972 was 1972, checked in by jmt14, 23 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 22.4 KB
Line 
1#!/usr/bin/perl -w
2
3# k4.pl
4# Version 1.1
5
6# Kea -- Automatic Keyphrase Extraction
7# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8# Contact [email protected] or [email protected]
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24# Version history
25#
26# 1.0 Ready to rebuild all the data for Witten et. al. 1998
27# 1.0.1 bug: empty @attribute enumerations cause java crashes.
28# 1.0.2 output number of documents found
29# 1.0.3 bug: if -F and !defined($df{phrase}) then divide by zero crash.
30# 1.0.4 Default attributes reduced to phrase, document, tfidf, first, class
31# 1.0.5 Add keyphrase-frequency file option and attribute
32# 1.0.6 keyphrase-frequency file corrects for covering input
33# 1.0.7 use kea-tidy-key-file.pl | stemmer to clean up key files
34# 1.0.8 -t toggle uses .tagged instead of .clauses files
35# 1.0.9 Cleaned up output so it goes to STDERR not STDOUT
36# 1.0.10 Use $perl_command for system-independence
37# 1.0.11 List command line parameters if no arguments are given.
38# 1.1 First Distribution. GPL added. Documentation improved.
39# 1.1.1 Tweaked output a little; no changes to function
40
# Banner goes to STDERR so STDOUT stays clean (see changelog 1.0.9).
print STDERR "\nk4.pl: the arff file builder for Kea\n";

# Root of the Greenstone installation, used to locate the stemmer helper.
# NOTE(review): assumes GSDLHOME is set in the environment -- no fallback.
$gsdlhome = $ENV{'GSDLHOME'};

# Command line options
# k4.pl [options] <input-directory> <output-arff-file>
# options are:
# -abciFs
# -L max length
# -f <df file>
# -S <stopword file>

# Legacy getopts library: each parsed flag appears as a package global
# ($opt_a, $opt_L, ...).  Options followed by ':' take an argument.
require("getopts.pl");
&Getopts("abciL:f:FsS:kK:t");

# What files shall we use?
# Both positional arguments are mandatory; otherwise print usage and stop.
if (!$ARGV[0] || !$ARGV[1]) {
    print STDERR "usage: k4.pl [options] <input-directory> <output-arff-file>\n";
    print STDERR " options: -a stopword mode (default)\n";
    print STDERR " -b brill-phrase mode (requires -t)\n";
    print STDERR " -c carl-phrase mode (requires -t)\n";
    print STDERR " -f [global frequency file]\n";
    print STDERR " -F global frequency file includes input files\n";
    print STDERR " -i ignore stopwords in word counts\n";
    print STDERR " -K [keyword frequency file]\n";
    print STDERR " -k keyword frequency file includes input files\n";
    print STDERR " -L [maximum phrase length] in words (default = 3)\n";
    print STDERR " -S [stopword file]\n";
    print STDERR " -s keep singleton phrases\n";
    print STDERR " -t input files are *.tagged (default is *.clauses)\n";
    die "\n";
}
# First positional argument: directory holding the input documents.
# Strip a trailing slash so the `find $directory/` call below does not
# produce a doubled separator.
$directory = $ARGV[0];
$directory =~ s/\/$//;

# Second positional argument: the ARFF file we will write.
$arfffile = $ARGV[1];
#print STDERR "Input directory: $directory\n";
#print STDERR "Output Arff file: $arfffile\n";


# Set the maximum candidate phrase length, in words.
# BUGFIX: the original pattern was /^\d$/, which matches a SINGLE digit
# only, so "-L 10" (or any value above 9) was silently ignored and the
# default of 3 used instead.  /^\d+$/ accepts any whole number.
if (($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
    $max_phrase_length = $opt_L;
} else {
    $max_phrase_length = 3;
}
print STDERR "Maximum phrase length: $max_phrase_length\n";
87
88
# Are we in Stopword mode, Brill mode, or Carl mode?
if ($opt_b && $opt_c) {
    print STDERR "Warning: Brill phrase option overriding Carl phrase option.\n";
    # BUGFIX: the warning promises that -b overrides -c, but the original
    # branch set NO mode at all, leaving the extraction loop with no phrase
    # filtering.  Enable Brill mode here exactly as the -b branch does.
    $brill_mode = 1;
    $brill_n = "/(NN|NNS|NNP|NNPS|VBG)\$";
    $brill_nj = "/(NN|NNS|NNP|NNPS|JJ)\$";

} elsif ($opt_b) {
    # Brill phrases: candidate phrases are validated against part-of-speech
    # tags.  The leading "/" in the patterns matches the word/TAG separator.
    print STDERR "Brill phrase mode\n";
    $brill_mode = 1;

    # last word of a phrase must carry one of these tags
    $brill_n = "/(NN|NNS|NNP|NNPS|VBG)\$";
    # earlier words may also be adjectives
    $brill_nj = "/(NN|NNS|NNP|NNPS|JJ)\$";

} elsif ($opt_c) {
    # Carl phrases
    print STDERR "Carl phrase mode\n";
    $carl_mode = 1;

    # regular expressions used to determine brillity
    $WORD = "[a-zA-Z0-9]+(\-?)[a-zA-Z0-9]+";
    $NOUN = "($WORD\/NN[A-Z]*)";
    $ADJ = "($WORD\/JJ[A-Z]*)";
    $VBG = "($WORD\ing\/VBG)";
    $VBN = "($WORD\ed\/VBN)";

    $VB = "($WORD\/VB)";
    $VBP = "($WORD\/VBP)";
    $VERB = "$VBN|$VB|$VBG|$VBP";

    # The OEDs conjunctions -- Carl's stuff
    $OED_CONJ = "((after|albeit|although|an|and|as|because|before|but|cos|directly|either|except|for|howbeit|if|immediately|instantly|lest|like|likewise|neither|nevertheless|nisi|nor|notwithstanding|now|only|or|otherwise|provided|providing|qua|since|so|supposing|than|that|tho|though|till|unless|until|when|whencesoever|whenever|whereas|whether|while|whilst|yet)\/CC)";

    # The OED's list of prepositions -- Carl's stuff
    $OED_PREP = "((abaft aboard about above across afore after against agin along alongside amid amidst among amongst anent around aslant astride at athwart bar barring before behind below beneath beside besides between betwixt beyond but by circa concerning considering despite down during ere except excepting failing for from in inside into less like mid midst minus near neath next nigh nigher nighest notwithstanding oer of off on onto outside over past pending per plus qua re respecting round sans save saving since thro through throughout thru till to touching toward towards tween twixt under underneath unlike until unto up upon versus via vice wanting with within without)\/IN)";

    # The OED based Carl-phrase wordlist
    # NOTE(review): $OED_PREP joins its words with spaces, not "|", so as a
    # regex alternation it only matches the entire literal word list --
    # this looks unintended, but is preserved as-is.  TODO confirm.
    $INCC = "$OED_CONJ|$OED_PREP";

} else {
    # stopword mode (the default): candidate phrases may not start or end
    # with a stopword.
    #print STDERR "Stopword mode\n";
    $stopword_mode = 1;

    $stopword_file = "stopwords";
    if ($opt_S) {
        if (-e "$opt_S") {
            $stopword_file = $opt_S;
        } else {
            # fall back to the default list rather than dying
            print STDERR "Can't find stopword file: $opt_S\n";
        }
    }
    print STDERR "Using stopword file: $stopword_file\n";

    # read the stopwords, one per line, lower-cased for case-insensitive
    # lookup.  NOTE(review): open is unchecked; a missing default file
    # silently yields an empty stopword list.
    open(S, "<$stopword_file");
    while (<S>) {
        chomp;
        $stopword{lc($_)} = 1;
    }
    close(S);
}
151
152
# Should we ignore stopwords in word counts?  (-i)
# Idiom fix: the original tested "$opt_i && $opt_i" (and likewise for
# -k, -s and -t below) -- a redundant self-conjunction; one test suffices.
if ($opt_i) {
    print STDERR "Ignoring stopwords in word counts.\n";
    $use_all_words_in_word_count = 0;
} else {
    #print STDERR "Using all words (including stopwords) in word count (default).\n";
    $use_all_words_in_word_count = 1;
}


# Which global-frequency file are we using
if ($opt_f) {
    $document_frequency_file = $opt_f;
} else {
    $document_frequency_file = "document-frequencies";
}
print STDERR "Document frequency file: $document_frequency_file\n";


# How is the global-frequency file created?
# Note: $testfile is added to the df figure for each phrase.
# If the current document is not part of the document frequencies
# file then the df figure needs to be increased by 1 to allow
# for the fact that the phrase also occurs in the current file.

if ($opt_F && (-e "$document_frequency_file")) {
    print STDERR "Text files are covered by specified document frequency file\n";
    $testfile = 0;
} elsif ($opt_F) {
    die "Document frequency file doesn't exist, -F option impossible\n";
} elsif (-e "$document_frequency_file") {
    print STDERR "Text files are not covered by document frequency file\n";
    $testfile = 1;
} else {
    print STDERR "Document frequency file does not exist: create from training data\n";
    $testfile = 0;
}

# Which keyword-frequency file are we using
if ($opt_K && (-e $opt_K)) {
    $keyword_frequency_file = $opt_K;
    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    if ($opt_k) {
        print STDERR "Keyword frequency file covers input files\n";
        $kf_covers_input = 1;
    } else {
        print STDERR "Keyword frequency is independent of input\n";
        $kf_covers_input = 0;
    }
} elsif ($opt_K) {
    die "Keyword frequency file doesn't exist, -K option impossible\n";
} else {
    # empty string doubles as the "no kf file" flag throughout the script
    $keyword_frequency_file = "";
    print STDERR "No keyword frequency file\n";
}


# Should we show singleton phrases?  (-s keeps them)
if ($opt_s) {
    $suppress_singleton_phrases = 0;
    print STDERR "Including singleton phrases.\n";
} else {
    $suppress_singleton_phrases = 1;
    print STDERR "Ignoring singleton phrases (default).\n";
}


# Do we look for *.tagged or *.clauses?
$suffix = "clauses";
if ($opt_t) {
    $suffix = "tagged";
}
print STDERR "Input file suffix: $suffix\n";

# How to run scripts etc (changelog 1.0.10: system-independence)
$perl_command = "perl -w";

# Are we using Turney's data (hard-wired off; enables journal attribute)
$Turney = 0;
232
233
234
235
# Start keyphrase extraction

# find the documents: every *.$suffix file anywhere under the input
# directory, via the external `find` command.
@documents = split(/\s+/, `find $directory/ -name "*.$suffix"`);
if (!$documents[0]) {
    die "\nk4.pl error: no documents found.";
} else {
    # $#documents is the last index, so +1 is the document count
    print STDERR "\nProducing keyphrases for ", $#documents + 1, " documents\n";
}

print STDERR "Finding candidate phrases...\n";
$document_number = 1;
248
# For every input document, emit three parallel files:
#   $document.stemmed   -- each candidate phrase, stemmed (via the external
#                          stemmer the output pipe runs through)
#   $document.unstemmed -- the same phrases, unstemmed and untagged
#   $document.distance  -- word offset of each phrase's first word
foreach $document (@documents) {
    #print STDERR " document $document_number: $document\r";
    $document_number++;

    # if we've already produced this file, skip it.
    next if (-e "$document.stemmed");

    open(F, "<$document");

    #open(S, "| ./stemmer > $document.stemmed");
    open(S, "| $gsdlhome/perllib/Kea-1.1.4/stemmer > $document.stemmed");
    open(U, ">$document.unstemmed");
    open(D, ">$document.distance");

    # running word count from the start of the document
    $distance = 0;

    while (<F>) {
        chomp;

        # if we're in stopword mode, throw away the tags
        if ($stopword_mode) {
            s/\/\S+//g;
        }

        # work through each line
        @words = split(/ +/, $_);

        foreach $i (0..$#words) {
            $finalword = $words[$i];

            # calculate distance from start (-i not set):
            # every word counts
            if ($use_all_words_in_word_count) {
                $distance ++;
            }

            # make sure the last word in the phrase is okay
            if ($stopword_mode) {
                # the last word must not be a stopword
                next if $stopword{lc($finalword)};
            } elsif ($brill_mode) {
                # the last word must carry a noun/VBG tag
                next unless ($finalword =~ /$brill_n/);
            } elsif ($carl_mode) {
                # the last word must be either a noun or a vbg
                next unless (($finalword =~ /^$VBG$/) ||
                             ($finalword =~ /^$NOUN$/));
            }

            # calculate distance from start in -i mode
            # (this is still flawed for phrases containing stopwords)
            if (!$use_all_words_in_word_count) {
                $distance ++;
            }

            # print $finalword as a phrase of length 1
            if ($stopword_mode) {
                print U "$finalword\n";
                print S "$finalword\n";
            } else {
                # strip the /TAG suffix before output
                $untag = $finalword;
                $untag =~ s/\/\S+$//;
                print U "$untag\n";
                print S "$untag\n";
            }
            print D $distance, "\n";

            # construct each longer phrase that ends on this word.
            $phrase = $finalword;
            $counter = 1;

            foreach $j (1..$max_phrase_length) {

                # make sure we don't try to get a word from "before" the line
                last if ($j > $i);

                # prepend the next word leftwards
                $firstword = $words[$i - $j];
                $phrase = "$firstword $phrase";

                # make sure the phrase is valid
                if ($stopword_mode) {
                    # the first word may not be a stopword
                    next if $stopword{lc($firstword)};
                    $counter++;
                } elsif ($brill_mode) {
                    # all words other than the last must match $brill_nj
                    last unless ($firstword =~ /$brill_nj/);
                    $counter++;
                } elsif ($carl_mode) {
                    # the carl-reg-exp applies (checks the global $phrase)
                    if (($firstword =~ /^$VBG$/) || ($firstword =~ /^$NOUN$/)) {
                        $counter++;
                    }
                    next unless (isValidCarlPhrase());
                }

                # print the phrase
                if ($stopword_mode) {
                    print U "$phrase\n";
                    print S "$phrase\n";
                } else {
                    $untag = $phrase;
                    $untag =~ s/\/\S+//g;
                    print U "$untag\n";
                    print S "$untag\n";
                }

                # print the distance of the phrase's first word
                if ($use_all_words_in_word_count) {
                    print D ($distance - $j), "\n";
                } else {
                    print D ($distance - $counter), "\n";
                }
            }

        }
    }

    close(S);
    close(U);
    close(D);
    close(F);

    $document_size{$document} = $distance; #WILL CHANGE THIS BACK
    # BUGFIX: this was a hard-coded /home/jmt14/gsdl/... path; use $gsdlhome
    # for consistency with the stemmer invocation above, so the script works
    # outside the original author's home directory.
    `$perl_command $gsdlhome/perllib/Kea-1.1.4/kea-choose-best-phrase.pl $document`;
}
375
376#print STDERR "\nMake sure we know document sizes...\n";
377
378foreach $document (@documents) {
379 if (!defined($document_size{$document})) {
380 $max = 0;
381 open(D, "<$document.distance");
382 while (<D>) {
383 chomp;
384 if ($_ > $max) { $max = $_; }
385 }
386 close(D);
387 $document_size{$document} = $max;
388 print STDERR "$document has size $document_size{$document}\n";
389 }
390}
391
392
393
# Calculate document frequencies

print STDERR "Gathering document frequencies...\n";

if (-e "$document_frequency_file") {

    print STDERR "Found document frequencies -- reading them!\n";

    open(F, "<$document_frequency_file");

    # First line holds the number of documents the frequencies were
    # computed over; needed for the IDF term later.
    $numdocs_in_global_corpus = <F>;
    chomp($numdocs_in_global_corpus);
    print STDERR "$numdocs_in_global_corpus documents in frequency file.\n";

    # Remaining lines: "<phrase words...> <count>" -- the count is the
    # last space-separated field, everything before it is the phrase.
    while (<F>) {
        @data = split(/ +/, $_);
        $phrase = join(' ', @data[0..$#data - 1]);
        $df{$phrase} = $data[$#data];
    }
    close(F);

} else {

    #print STDERR "Document frequency file not found... creating\n";

    # Count, for each stemmed phrase, how many documents it appears in
    # (%seen guards against counting a document twice per phrase).
    foreach $document (@documents) {
        undef %seen;
        foreach $phrase (split(/\n/, `cat $document.stemmed`)) {
            $df{$phrase} ++ if (!defined($seen{$phrase}));
            $seen{$phrase} = 1;
        }
    }
    $numdocs_in_global_corpus = $#documents + 1;

    # Persist the table in the same format read above.
    print STDERR "Writing document frequencies to file...\n";
    open(F, ">$document_frequency_file");
    print F "$numdocs_in_global_corpus\n";
    foreach $phrase (keys %df) {
        @data = split(/ +/, $phrase);
        # note we keep phrases 1 longer then we need to just in
        # case we ever want to calculate entropy statistic
        if ($#data < $max_phrase_length) {
            print F "$phrase $df{$phrase}\n";
        }
    }
    close(F);
}
441
442
# Read the keyword frequency file, when one was supplied (-K).
# Format mirrors the document-frequency file: the first line is the number
# of documents it was built from; every later line is
# "<phrase words...> <count>", with the count in the last field.
if ($keyword_frequency_file) {
    print STDERR "Reading keyword frequency file\n";

    open(KF, "<$keyword_frequency_file");

    $size_of_keyword_frequency_file = <KF>;
    chomp($size_of_keyword_frequency_file);
    print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n";

    while (my $line = <KF>) {
        my @fields = split(/\s+/, $line);
        my $count = pop @fields;            # frequency is the final field
        $kf{ join(' ', @fields) } = $count; # key is the phrase itself
    }
    close(KF);
}
460
461
462
# What journals are we using?
# In Turney's corpus the numeric part of a document's filename determines
# the journal it came from; collect every journal that occurs so the ARFF
# header can enumerate them as a nominal attribute.
if ($Turney) {
    $journals{"unknown"} = 1;
    foreach $document (@documents) {
        ($docnum) = $document =~ /(\d+)\.$suffix/;

        my $journal_name;
        if    ($docnum < 7)  { $journal_name = "JAIHR"; }
        elsif ($docnum < 27) { $journal_name = "Psycoloquy"; }
        elsif ($docnum < 29) { $journal_name = "Neuroscientist"; }
        elsif ($docnum < 43) { $journal_name = "JCAMD"; }
        else                 { $journal_name = "BBS"; }

        $journals{$journal_name} = 1;
    }
}
481
482
# Write the arff files

print STDERR "Writing ARFF file\n";

open (ARFF, ">$arfffile");

# Header: relation name, then one @attribute line per feature, in the
# same order the instance rows are written below.
print ARFF "\@relation keyphrase\n\n";
print ARFF "\@attribute phrase string\n";
# nominal attribute enumerating every input file
print ARFF "\@attribute document {", join(', ', @documents), "}\n";

if ($Turney) {
    print ARFF "\@attribute journal {", join(', ', keys(%journals)), "}\n";
}
if ($keyword_frequency_file) {
    print ARFF "\@attribute keyword_freq real\n";
}

print ARFF "\@attribute tfidf real\n";
print ARFF "\@attribute first_occurrence real\n";


# Disabled candidate attributes, kept for reference:
# print ARFF "\@attribute entropy real\n";
# print ARFF "\@attribute in_first_5_percent real\n";
# print ARFF "\@attribute last real\n";
# print ARFF "\@attribute interval_deviation real\n";
# print ARFF "\@attribute max_cluster_of_$offset_clumping_cutoff real\n";
# print ARFF "\@attribute term_frequency real\n";
# print ARFF "\@attribute document_frequency real\n";
print ARFF "\@attribute class {yes,no}\n";

print ARFF "\n\@data\n";


# the number of keyphrases not covered by the arff file
print STDERR "Writing instances...\n";
$not_included = 0;
$document_number = 1;
520
# Main output loop: for each document, load its author-assigned keyphrases,
# aggregate per-phrase statistics from the .stemmed/.distance files, then
# write one ARFF instance per distinct candidate phrase.
foreach $document (@documents) {
    print STDERR " document $document_number: $document\r";
    $document_number++;

    # load the keyphrases: the .key file sits beside the input file
    undef %keyphrase;

    $keyphrases = $document;
    $keyphrases =~ s/$suffix$/key/;

    # tidy and stem the key file so its phrases are comparable with the
    # stemmed candidates.  NOTE(review): kea-tidy-key-file.pl and stemmer
    # are invoked without a path here -- presumably relies on cwd/PATH;
    # verify against how the script is launched.
    if (-e $keyphrases) {
        foreach (split(/\n/, `cat $keyphrases | $perl_command kea-tidy-key-file.pl | stemmer`)) {
            $keyphrase{$_} = 1;
        }
    }

    # Calculate document aggregates from stemmed file

    # total frequency in the document
    undef %tf;
    # the length of the document (in words, from the extraction pass)
    $document_size = $document_size{$document};

    # number of occurrences in the first 5% of the document
    # undef %first_5;
    # last occurrence
    # undef %last;
    # for standard deviation of distance between occurrences
    # undef %interval_sd;
    # undef %sumsquare;
    # undef %sum;
    # for entropy and Carl (disabled) measures
    # undef %entropy;
    # undef %f;
    # undef %ff;
    # $total_entropy{$document} = 0;
    # $total_instances{$document} = 0;
    # for calculating "offsets" sizes
    # undef %offsets;
    # undef %biggest_offsets;

    # First pass: count term frequencies.  S and D are parallel files,
    # one line per candidate phrase.
    open(S, "<$document.stemmed");
    open(D, "<$document.distance");

    while (<S>) {
        chop($phrase = $_);
        $tf{$phrase}++;

        #@p = split(/ +/, $phrase);
        #$left = join(' ', @p[0..$#p - 1]);
        #$right = join(' ', @p[1..$#p]);
        #$f{$left}{$phrase} ++;
        #$f{$right}{$phrase} ++;
        #$ff{$left} ++;
        #$ff{$right} ++;

        # keep D in step with S even though the value is unused here
        chop($distance = <D>);

        # $first_5{$phrase} ++ if ($distance / $document_size < 0.05);

        #if (!defined($sum{$phrase})) {
        # $sum{$phrase} = 0;
        # $sumsquare{$phrase} = 0;
        #} else {
        # $difference = $distance - $last{$phrase};
        # $sum{$phrase} += $difference;
        # $sumsquare{$phrase} += ($difference * $difference);
        #}

        # $last{$phrase} = $distance;

        #if (!defined($biggest_offsets{$phrase})) {
        # @{$offsets{$phrase}} = ($distance);
        # $biggest_offsets{$phrase} = 1;
        #} else {
        # $cutoff = $distance - $offset_clumping_cutoff;
        # #print "-- \n$phrase \noffsets: @{$offsets{$phrase}}\n";
        # push( @{$offsets{$phrase}}, $distance );
        # while (@{$offsets{$phrase}}[0] < $cutoff) {
        # shift( @{$offsets{$phrase}} );
        # }
        # # print "offsets: @{$offsets{$phrase}}\n";
        # if ($#{$offsets{$phrase}} >= $biggest_offsets{$phrase}) {
        # $biggest_offsets{$phrase} = $#{$offsets{$phrase}} + 1;
        # }
        #}
    }

    close(S);
    close(D);
    # undef %offsets;


    # Calculate complex attribute values (all currently disabled)

    foreach $phrase (keys(%tf)) {

        # Calculate standard deviation of distance between occurrences
        #$interval_sd{$phrase} =
        # sqrt( ($sumsquare{$phrase} -
        # ($sum{$phrase} * $sum{$phrase} / $tf{$phrase}))
        # / $tf{$phrase} );
        #undef $sum{$phrase};
        #undef $sumsquare{$phrase};


        # Calculate Entropy
        #$t = 0;
        #$entropy = 0;
        #$entropy = -1 if (!defined($f{$phrase}));
        #
        #foreach $superphrase (keys(%{$f{$phrase}})) {
        # $t += $f{$phrase}{$superphrase};
        #}
        #
        #foreach $superphrase (keys(%{$f{$phrase}})) {
        # $p = $f{$phrase}{$superphrase} / $t;
        # $entropy -= $p * log($p) / log(2);
        #}
        #
        #$entropy{$phrase} = $entropy;

        # Calculate Carl statistic
        #$left = join(' ', @words[0..$#words - 1]);
        #$right = join(' ', @words[1..$#words]);

        #$carl1 = $f{$left}{$phrase} / $ff{$left};
        #$carl2 = $f{$right}{$phrase} / $ff{$right};
        #$carlmin = ($carl1 < $carl2) ? $carl1 : $carl2;
        #if ($#words == 0) { $carlmin = 1; }


        #undef $f{$phrase};
        #undef $ff{$phrase};

    }



    # Second pass: write the arff instances.  S, U and D are parallel
    # files, read in lock-step.
    open(S, "<$document.stemmed");
    open(U, "<$document.unstemmed");
    open(D, "<$document.distance");

    undef %seen;

    while (<S>) {
        chop($phrase = $_);
        chop($unstemmed = <U>);
        chop($distance = <D>);

        # only output each phrase once (first occurrence wins, so
        # $distance is the phrase's first offset)
        next if (defined($seen{$phrase}));
        $seen{$phrase} = 1;

        # ignore phrases of more than N words
        @words = split(/ +/, $phrase);
        next if ($#words >= $max_phrase_length);

        # ignore likely proper-nouns
        next if ($unstemmed =~ /^PN\-/);

        # ignore singleton phrases (unless -s)
        next if (($tf{$phrase} == 1) && $suppress_singleton_phrases);

        # Calculate TFIDF.  $testfile is 1 when this document is NOT part
        # of the df corpus, correcting both numerator and denominator
        # (also avoids the divide-by-zero noted in changelog 1.0.3).
        if (!defined($df{$phrase})) {
            if ($testfile) { $df{$phrase} = 0 }
            else { $df{$phrase} = 1; }
        }
        $tfidf = ($tf{$phrase} / $document_size) *
            (log(($numdocs_in_global_corpus + $testfile) / ($df{$phrase} + $testfile)) / log(2));

        # Initialise number of words in first 5%
        # $first_5{$phrase} = 0 if (!defined($first_5{$phrase}));

        # Calculate class: "yes" iff the phrase is an author keyphrase
        $class = defined($keyphrase{$phrase}) ? "yes" : "no";

        # For the Turney corpus, identify the journal from the filename
        ($docnum) = $document =~ /(\d+)\.$suffix/;
        if (!$Turney || !$docnum) {
            $journal = "?";
        } elsif ($docnum < 7) {
            $journal = "JAIHR";
        } elsif ($docnum < 27) {
            $journal = "Psycoloquy";
        } elsif ($docnum < 29) {
            $journal = "Neuroscientist";
        } elsif ($docnum < 43) {
            $journal = "JCAMD";
        } else {
            $journal = "BBS";
        }


        # Write an instance to the arff file, in header order
        print ARFF "'$phrase ($unstemmed)',$document,";
        if ($Turney) { print ARFF "$journal,"; }
        if ($keyword_frequency_file) {
            if ($kf{$phrase}) {
                # when the kf file covers the input, discount this
                # document's own contribution for its keyphrases
                if (defined($keyphrase{$phrase}) && $kf_covers_input) {
                    print ARFF ($kf{$phrase} - 1), ",";
                } else {
                    print ARFF "$kf{$phrase},";
                }
            } else {
                print ARFF "0,";
            }
        }
        printf(ARFF "%.3g,%.3g,%s\n",
            $tfidf,
            $distance / $document_size,
            # $entropy{$phrase},
            # $first_5{$phrase} / $tf{$phrase},
            # $last{$phrase} / $document_size,
            # $interval_sd{$phrase},
            # $biggest_offsets{$phrase},
            # $tf{$phrase} / $document_size,
            # $df{$phrase} + $testfile,
            $class
        );

        # mark this keyphrase as covered (0 = seen among candidates)
        $keyphrase{$phrase} = 0 if (defined($keyphrase{$phrase}));

    }

    # Author keyphrases that never appeared as candidates are recorded as
    # comment + all-missing instances, so coverage can be audited.
    print ARFF "% Begin missing phrases for $document\n";
    foreach $keyphrase (keys(%keyphrase)) {
        next if ($keyphrase{$keyphrase} == 0);
        $not_included ++;
        print ARFF "'$keyphrase', $document, ";
        if ($Turney) { print ARFF " ?, "; }
        if ($keyword_frequency_file) { print ARFF " ?, "; }
        print ARFF "?, ?, ?\n";
    }
    print ARFF "% Finish missing phrases for $document\n";

    close(S);
    close(U);
    close(D);
}
763
# finish off arff file
print ARFF "% $not_included key phrases not included\n";
# BUGFIX: this diagnostic went to STDOUT via a bare print; the script's own
# convention (changelog 1.0.9) is that all status output goes to STDERR so
# STDOUT stays clean.  Route it to STDERR.
print STDERR "% $not_included key phrases not included\n";


print STDERR "k4.pl: $arfffile complete\n\n";
770
771
772
773
774
# Is the current phrase a valid Carl phrase?
# The phrase is stored in the global $phrase; the part-of-speech regular
# expressions ($NOUN, $ADJ, $VERB, $VBG, $INCC) are set up in carl mode.
# A valid Carl phrase is a run of nouns/adjectives/verbs, optionally
# joined by single conjunctions/prepositions, ending in a noun or VBG.
# Returns 1 if valid, 0 otherwise.


sub isValidCarlPhrase () {
    # Scope the working variables lexically -- the original leaked
    # @wds/$index/$wd as package globals.
    my @wds = split(/ +/, $phrase);
    my $index = 0;
    my $wd = $wds[$index];

    while ($index < $#wds) {
        # current word must be a noun, adj, or verb
        if (($wd =~ /^$NOUN$/) || ($wd =~ /^$ADJ$/) || ($wd =~ /^$VERB$/)) {
            $index++;
            $wd = $wds[$index];
        } else {
            return 0;
        }

        # next is an optional incc (conjunction or preposition)
        if ($wd =~ /^$INCC$/) {
            # it is an incc, so advance one word
            $index++;
            $wd = $wds[$index];
        }
    }

    # since we can advance two in the loop, it's possible to have
    # run out of input. If this is the case, then the phrase is
    # not valid, as there's no room for a final NN or VBG
    if ($index > $#wds) {
        return 0;
    }

    # the last word must be either a noun or a vbg
    if (($wd =~ /^$VBG$/) || ($wd =~ /^$NOUN$/)) {
        return 1;
    } else {
        return 0;
    }
}
Note: See TracBrowser for help on using the repository browser.