Changeset 4281
- Timestamp:
- 2003-05-14T16:16:07+12:00 (21 years ago)
- Location:
- trunk/gsdl/perllib/Kea-1.1.4
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/Kea-1.1.4/Kea
r1995 r4281 44 44 # 1.1.4 Updated documentation and added a few extra files. 45 45 46 print STDERR " \nKea (version 1.1.4): automatic keyphrase extraction\n";46 print STDERR "Kea (version 1.1.4): automatic keyphrase extraction\n"; 47 47 48 48 $gsdlhome = $ENV{'GSDLHOME'}; 49 49 50 `gcc -o $gsdlhome/perllib/Kea-1.1.4/stemmer $gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem-Lovins-iterated.c $gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem.c`; 51 50 if (! -x "$gsdlhome/perllib/Kea-1.1.4/stemmer") { 51 print STDERR "** need to compile stemmer **\n"; 52 system("gcc", ("-o", "$gsdlhome/perllib/Kea-1.1.4/stemmer", 53 "$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem-Lovins-iterated.c", 54 "$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem.c" 55 )); 56 } 52 57 53 58 # Kea runs other perl scripts in shells. … … 80 85 81 86 # Parse command line options 82 require("getopts.pl"); 83 &Getopts("dtN:E:C:F:K:L:M:S:"); 87 use Getopt::Std; 88 my $opt_d=0; # debug option 89 getopts("dtN:E:C:F:K:L:M:S:"); 84 90 85 91 # What files shall we use? 
… … 103 109 104 110 # Number of phrases to extract 105 if ( ($opt_N) && ($opt_N =~ /^\d+$/) && ($opt_N > 0)) {111 if (defined($opt_N) && ($opt_N =~ /^\d+$/) && ($opt_N > 0)) { 106 112 $number_of_phrases = "-N $opt_N"; 107 113 print STDERR "Number of phrases to extract: $opt_N\n"; 108 } elsif ( $opt_N) {114 } elsif (defined($opt_N)) { 109 115 die "Kea cannot understand -N argument (must be a number): $opt_N\n"; 110 116 } else { 111 print STDERR "Number of phrases dictated by model (default)\n"; 117 if ($opt_d) { 118 print STDERR "Number of phrases dictated by model (default)\n"; 119 } 112 120 $number_of_phrases = ""; 113 121 } … … 115 123 # -L maximum phrase length 116 124 $maximum_phrase_length = 0; 117 if ( ($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {125 if (defined($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) { 118 126 $maximum_phrase_length = "$opt_L"; 119 127 print STDERR "Maximum phrase length: $opt_L\n"; 120 } elsif ( $opt_L) {128 } elsif (defined($opt_L)) { 121 129 die "Kea cannot understand -L argument (must be a number): $opt_L\n"; 122 130 } … … 124 132 # -E What extension shall we use? 125 133 $extension = "kea"; 126 if ( $opt_E) {134 if (defined($opt_E)) { 127 135 $extension = $opt_E; 128 136 $extension =~ s/^\.//g; … … 131 139 132 140 # -t Should we output tfidf? (This option is used by Kniles.) 
133 if ( $opt_t&& $opt_t) {141 if (defined($opt_t) && $opt_t) { 134 142 $output_tfidf = 1; 135 143 print STDERR "Do print tf.idf\n"; … … 141 149 # -C Corpus file stem 142 150 $default_stem = "$gsdlhome/perllib/Kea-1.1.4/aliweb"; 143 if ( $opt_C) {151 if (defined($opt_C)) { 144 152 print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n"; 145 153 $default_stem = "$gsdlhome/perllib/Kea-1.1.4/$opt_C"; … … 155 163 156 164 # -F Document Frequency file 157 $frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F" if ($opt_F); 158 print STDERR "Document frequency file: $frequency_file\n"; 165 if (defined($opt_F)) {$frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F";} 166 if ($opt_d) {print STDERR "Document frequency file: $frequency_file\n";} 159 167 die "Document frequency file does not exist!\n" if (!(-e $frequency_file)); 160 168 161 169 # -M Model file 162 $model_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_M" if ($opt_M); 163 print STDERR "Model file: $model_file\n"; 170 if (defined($opt_M)) {$model_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_M";} 171 if ($opt_d) {print STDERR "Model file: $model_file\n";} 164 172 die "Model file does not exist!\n" if (!(-e $model_file)); 165 173 166 174 # -S Stopword file 167 $stopword_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_S" if ($opt_S); 168 print STDERR "Stopword file: $stopword_file\n"; 175 if (defined($opt_S)) {$stopword_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_S";} 176 if ($opt_d) {print STDERR "Stopword file: $stopword_file\n";} 169 177 die "Stopword file does not exist!\n" if (!(-e $stopword_file)); 170 178 171 179 # -K Keyword frequency file 172 $keyword_frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_K" if ($opt_K); 180 if (defined($opt_K)) { 181 $keyword_frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_K"; 182 } 173 183 if ($keyword_frequency_file) { 174 184 print STDERR "Keyword frequency file: $keyword_frequency_file\n"; 175 die "Keyword frequency file does not exist!\n" if (! 
(-e $keyword_frequency_file));185 die "Keyword frequency file does not exist!\n" if (! -e $keyword_frequency_file); 176 186 } else { 177 print STDERR "No keyword frequency file (default)\n";187 if ($opt_d) {print STDERR "No keyword frequency file (default)\n";} 178 188 } 179 189 180 190 # Count the number of files 181 191 $number_of_files = $#ARGV + 1; 182 print STDERR "Number of files: $number_of_files\n\n"; 192 if ($opt_d) {print STDERR "Number of files: $number_of_files\n\n";} 183 193 184 194 … … 197 207 198 208 # Process each input file into a working file 199 print STDERR "Preparing input files in: $data\n"; 209 if ($opt_d) {print STDERR "Preparing input files in: $data\n";} 200 210 201 211 for ($f = 0; $f <= $#ARGV; $f++) { 202 print STDERR " document ", ($f+1), ": $ARGV[$f]\ r";212 print STDERR " document ", ($f+1), ": $ARGV[$f]\n"; 203 213 204 214 $file = $ARGV[$f]; … … 233 243 $command .= " $data $arff"; 234 244 235 #print STDERR "** $command **\n"; 236 `$command`; 245 if ($opt_d) { 246 print STDERR "** $command **\n"; 247 } 248 system ("$command"); 237 249 238 250 … … 244 256 $command .= " > $out 2> $err"; 245 257 246 print STDERR "** $command **\n"; 247 `$command`; 258 if ($opt_d) { 259 print STDERR "** $command **\n"; 260 } 261 system ("$command"); 248 262 249 263 … … 288 302 289 303 #get rid of temporary files 290 #if (!$opt_d && !$opt_d) { 291 # `rm -r $data $arff $out $err`; 292 #} 293 294 295 296 297 298 299 304 if (!$opt_d) { 305 system ("rm", ("-r", "$data", "$arff", "$out", "$err")); 306 } -
trunk/gsdl/perllib/Kea-1.1.4/k4.pl
r1989 r4281 39 39 # 1.1.1 Tweaked output a little; no changes to function 40 40 41 print STDERR " \nk4.pl: the arff file builder for Kea\n";41 print STDERR "k4.pl: the arff file builder for Kea\n"; 42 42 43 43 $gsdlhome = $ENV{'GSDLHOME'}; … … 51 51 # -S <stopword file> 52 52 53 require("getopts.pl"); 54 &Getopts("abciL:f:FsS:kK:t"); 53 use Getopt::Std; 54 my $opt_i=0; 55 my $opt_t=0; 56 getopt("abciL:f:FsS:kK:t"); 57 my $debug=0; 55 58 56 59 # What files shall we use? … … 71 74 die "\n"; 72 75 } 73 $directory = $ARGV[0];76 my $directory = $ARGV[0]; 74 77 $directory =~ s/\/$//; 75 78 $arfffile = $ARGV[1]; 76 print STDERR "Input directory: $directory\n"; 77 print STDERR "Output Arff file: $arfffile\n"; 79 80 if ($debug) { 81 print STDERR "Input directory: $directory\n"; 82 print STDERR "Output Arff file: $arfffile\n"; 83 } 84 85 86 87 88 # Is the current phrase a valid Carl phrase? 89 # The phrase is storedin $phrase. 90 my ($NOUN, $ADJ, $VERB, $INCC, $VBG); # globals, set below 91 92 sub isValidCarlPhrase () { 93 @wds = split(/ +/, $phrase); 94 $index = 0; 95 $wd = $wds[$index]; 96 97 while ($index < $#wds) { 98 # current word must be a noun, adj, or verb 99 if (($wd =~ /^$NOUN$/) || ($wd =~ /^$ADJ$/) || ($wd =~ /^$VERB$/)) { 100 $index++; 101 $wd = $wds[$index]; 102 } else { 103 return 0; 104 } 105 106 # next is an optional incc 107 if ($wd =~ /^$INCC$/) { 108 # it is an incc, so advance one word 109 $index++; 110 $wd = $wds[$index]; 111 } 112 } 113 114 # since we can advance two in the loop, it's possible to have 115 # run out of input. 
If this is the case, then the phrase is 116 # not brill, as there's no room for a final NN or VBG 117 if ($index > $#wds) { 118 return 0; 119 } 120 121 # the last word must be either a noun or a vbg 122 if (($wd =~ /^$VBG$/) || ($wd =~ /^$NOUN$/)) { 123 return 1; 124 } else { 125 return 0; 126 } 127 } 128 129 130 131 78 132 79 133 … … 84 138 $max_phrase_length = 3; 85 139 } 86 print STDERR "Maximum phrase length: $max_phrase_length\n"; 87 140 141 if ($debug) { 142 print STDERR "Maximum phrase length: $max_phrase_length\n"; 143 } 88 144 89 145 # Are we in Stopword mode, Brill mode, or Carl mode? … … 93 149 } elsif ($opt_b) { 94 150 # Brill phrases 95 print STDERR "Brill phrase mode\n";151 if ($debug) {print STDERR "Brill phrase mode\n";} 96 152 $brill_mode = 1; 97 153 … … 101 157 } elsif ($opt_c) { 102 158 # Carl phrases 103 print STDERR "Carl phrase mode\n";159 if ($debug) {print STDERR "Carl phrase mode\n";} 104 160 $carl_mode = 1; 105 161 … … 108 164 $NOUN = "($WORD\/NN[A-Z]*)"; 109 165 $ADJ = "($WORD\/JJ[A-Z]*)"; 110 $VBG = "( $WORD\ing\/VBG)";111 $VBN = "( $WORD\ed\/VBN)";166 $VBG = "({$WORD}ing\/VBG)"; 167 $VBN = "({$WORD}ed\/VBN)"; 112 168 113 169 $VB = "($WORD\/VB)"; … … 139 195 } 140 196 } 141 print STDERR "Using stopword file: $stopword_file\n"; 197 if ($debug) { 198 print STDERR "Using stopword file: $stopword_file\n"; 199 } 142 200 143 201 # read the stopwords … … 152 210 153 211 # Should we ignore stopwords in word counts? 
154 if ($opt_i && $opt_i) {212 if ($opt_i) { 155 213 print STDERR "Ignoring stopwords in word counts.\n"; 156 214 $use_all_words_in_word_count = 0; … … 167 225 $document_frequency_file = "document-frequencies"; 168 226 } 169 print STDERR "Document frequency file: $document_frequency_file\n"; 227 228 if ($debug) { 229 print STDERR "Document frequency file: $document_frequency_file\n"; 230 } 170 231 171 232 … … 177 238 178 239 if ($opt_F && (-e "$document_frequency_file")) { 179 print STDERR "Text files are covered by specified document frequency file\n"; 240 if ($debug) { 241 print STDERR "Text files are covered by specified document " 242 . "frequency file\n"; 243 } 180 244 $testfile = 0; 181 245 } elsif ($opt_F) { 182 246 die "Document frequency file doesn't exist, -F option impossible\n"; 183 247 } elsif (-e "$document_frequency_file") { 184 print STDERR "Text files are not covered by document frequency file\n"; 248 if ($debug) { 249 print STDERR "Text files are not covered by document frequency file\n"; 250 } 185 251 $testfile = 1; 186 252 } else { … … 192 258 if ($opt_K && (-e $opt_K)) { 193 259 $keyword_frequency_file = $opt_K; 194 print STDERR "Keyword frequency file: $keyword_frequency_file\n"; 260 if ($debug) { 261 print STDERR "Keyword frequency file: $keyword_frequency_file\n"; 262 } 195 263 if ($opt_k && $opt_k) { 196 print STDERR "Keyword frequency file covers input files\n"; 264 if ($debug) { 265 print STDERR "Keyword frequency file covers input files\n"; 266 } 197 267 $kf_covers_input = 1; 198 268 } else { 199 print STDERR "Keyword frequency is independent of input\n"; 269 if ($debug) { 270 print STDERR "Keyword frequency is independent of input\n"; 271 } 200 272 $kf_covers_input = 0; 201 273 } … … 204 276 } else { 205 277 $keyword_frequency_file = ""; 206 print STDERR "No keyword frequency file\n";278 if ($debug) {print STDERR "No keyword frequency file\n";} 207 279 } 208 280 … … 214 286 } else { 215 287 $suppress_singleton_phrases = 1; 216 print 
STDERR "Ignoring singleton phrases (default).\n"288 if ($debug) {print STDERR "Ignoring singleton phrases (default).\n"} 217 289 } 218 290 … … 220 292 # Do we look for *.tagged or *.clauses? 221 293 $suffix = "clauses"; 222 if ($opt_t && $opt_t) {294 if ($opt_t) { 223 295 $suffix = "tagged"; 224 296 } 225 print STDERR "Input file suffix: $suffix\n"; 297 298 if ($debug) { 299 print STDERR "Input file suffix: $suffix\n"; 300 } 226 301 227 302 # How to run scripts etc 228 $perl_command = "perl -w"; 303 my $perl_command = "perl"; 304 my $perl_args = ("-w"); # a list 229 305 230 306 # Are we using Turney's data … … 241 317 die "\nk4.pl error: no documents found."; 242 318 } else { 243 print STDERR " \nProducing keyphrases for ", $#documents + 1, " documents\n";244 } 245 246 print STDERR "Finding candidate phrases...\n"; 319 print STDERR "Producing keyphrases for ", $#documents + 1, " documents\n"; 320 } 321 322 if ($debug) {print STDERR "Finding candidate phrases...\n";} 247 323 $document_number = 1; 248 324 249 325 foreach $document (@documents) { 250 print STDERR " document $document_number: $document\r";326 if ($debug) {print STDERR " document $document_number: $document\n";} 251 327 $document_number++; 252 328 … … 371 447 372 448 $document_size{$document} = $distance; #WILL CHANGE THIS BACK 373 `$perl_command /home/jmt14/gsdl/perllib/Kea-1.1.4/kea-choose-best-phrase.pl $document`; 449 450 system("$perl_command", ($perl_args, "$gsdlhome/perllib/Kea-1.1.4/kea-choose-best-phrase.pl", "$document")); 374 451 } 375 452 … … 394 471 # Calculate document frequencies 395 472 396 print STDERR "Gathering document frequencies...\n"; 473 if ($debug) {print STDERR "Gathering document frequencies...\n";} 397 474 398 475 if (-e "$document_frequency_file") { 399 400 401 476 if ($debug) { 477 print STDERR "Found document frequencies -- reading them!\n"; 478 } 402 479 open(F, "<$document_frequency_file"); 403 480 … … 426 503 $numdocs_in_global_corpus = $#documents + 1; 427 504 428 
print STDERR "Writing document frequencies to file...\n";505 if ($debug) {print STDERR "Writing document frequencies to file...\n";} 429 506 open(F, ">$document_frequency_file"); 430 507 print F "$numdocs_in_global_corpus\n"; … … 443 520 # Read the keyword frequency file 444 521 if ($keyword_frequency_file) { 445 print STDERR "Reading keyword frequency file\n";522 if ($debug) {print STDERR "Reading keyword frequency file\n";} 446 523 447 524 open(KF, "<$keyword_frequency_file"); … … 449 526 $size_of_keyword_frequency_file = <KF>; 450 527 chomp($size_of_keyword_frequency_file); 451 print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n"; 528 if ($debug) { 529 print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n"; 530 } 452 531 453 532 while (<KF>) { … … 483 562 # Write the arff files 484 563 485 print STDERR "Writing ARFF file\n"; 564 if ($debug) {print STDERR "Writing ARFF file\n";} 486 565 487 566 open (ARFF, ">$arfffile"); … … 515 594 516 595 # the number of keyphrases not covered by the arff file 517 print STDERR "Writing instances...\n"; 596 if ($debug) {print STDERR "Writing instances...\n";} 518 597 $not_included = 0; 519 598 $document_number = 1; 520 599 521 600 foreach $document (@documents) { 522 print STDERR " document $document_number: $document\r";601 if ($debug) {print STDERR " document $document_number: $document\n";} 523 602 $document_number++; 524 603 … … 767 846 768 847 769 print STDERR "k4.pl: $arfffile complete\n\n"; 770 771 772 773 774 775 # Is the current phrase a valid Carl phrase? 776 # The phrase is storedin $phrase. 
777 778 779 sub isValidCarlPhrase () { 780 @wds = split(/ +/, $phrase); 781 $index = 0; 782 $wd = $wds[$index]; 783 784 while ($index < $#wds) { 785 # current word must be a noun, adj, or verb 786 if (($wd =~ /^$NOUN$/) || ($wd =~ /^$ADJ$/) || ($wd =~ /^$VERB$/)) { 787 $index++; 788 $wd = $wds[$index]; 789 } else { 790 return 0; 791 } 792 793 # next is an optional incc 794 if ($wd =~ /^$INCC$/) { 795 # it is an incc, so advance one word 796 $index++; 797 $wd = $wds[$index]; 798 } 799 } 800 801 # since we can advance two in the loop, it's possible to have 802 # run out of input. If this is the case, then the phrase is 803 # not brill, as there's no room for a final NN or VBG 804 if ($index > $#wds) { 805 return 0; 806 } 807 808 # the last word must be either a noun or a vbg 809 if (($wd =~ /^$VBG$/) || ($wd =~ /^$NOUN$/)) { 810 return 1; 811 } else { 812 return 0; 813 } 814 } 848 if ($debug) {print STDERR "k4.pl: $arfffile complete\n\n";}
Note: See TracChangeset for help on using the changeset viewer.