Changeset 4281


Ignore:
Timestamp:
2003-05-14T16:16:07+12:00 (21 years ago)
Author:
jrm21
Message:

1) use Getopt::Std instead of getopts.pl
2) Only print informative messages if debug is set
Kea only:
3) only compile stemmer if it isn't already there.
4) remove temp files unless debug is set
k4.pl only:
5) need to move isValidCarlPhrase() before first call

Location:
trunk/gsdl/perllib/Kea-1.1.4
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/Kea-1.1.4/Kea

    r1995 r4281  
    4444# 1.1.4 Updated documentation and added a few extra files.
    4545
    46 print STDERR "\nKea (version 1.1.4): automatic keyphrase extraction\n";
     46print STDERR "Kea (version 1.1.4): automatic keyphrase extraction\n";
    4747
    4848$gsdlhome = $ENV{'GSDLHOME'};
    4949
    50 `gcc -o $gsdlhome/perllib/Kea-1.1.4/stemmer $gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem-Lovins-iterated.c $gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem.c`;
    51 
     50if (! -x "$gsdlhome/perllib/Kea-1.1.4/stemmer") {
     51    print STDERR "** need to compile stemmer **\n";
     52    system("gcc", ("-o", "$gsdlhome/perllib/Kea-1.1.4/stemmer",
     53           "$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem-Lovins-iterated.c",
     54           "$gsdlhome/perllib/Kea-1.1.4/Iterated-Lovins-stemmer/stem.c"
     55           ));
     56}
    5257
    5358# Kea runs other perl scripts in shells.
     
    8085
    8186# Parse command line options
    82 require("getopts.pl");
    83 &Getopts("dtN:E:C:F:K:L:M:S:");
     87use Getopt::Std;
     88my $opt_d=0; # debug option
     89getopts("dtN:E:C:F:K:L:M:S:");
    8490
    8591# What files shall we use?
     
    103109
    104110# Number of phrases to extract
    105 if (($opt_N) && ($opt_N =~ /^\d+$/) && ($opt_N > 0)) {
     111if (defined($opt_N) && ($opt_N =~ /^\d+$/) && ($opt_N > 0)) {
    106112    $number_of_phrases = "-N $opt_N";
    107113    print STDERR "Number of phrases to extract: $opt_N\n";
    108 } elsif ($opt_N) {
     114} elsif (defined($opt_N)) {
    109115    die "Kea cannot understand -N argument (must be a number): $opt_N\n";
    110116} else {
    111     print STDERR "Number of phrases dictated by model (default)\n";
     117    if ($opt_d) {
     118    print STDERR "Number of phrases dictated by model (default)\n";
     119    }
    112120    $number_of_phrases = "";
    113121}
     
    115123# -L maximum phrase length
    116124$maximum_phrase_length = 0;
    117 if (($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
     125if (defined($opt_L) && ($opt_L =~ /^\d+$/) && ($opt_L > 0)) {
    118126    $maximum_phrase_length = "$opt_L";
    119127    print STDERR "Maximum phrase length: $opt_L\n";
    120 } elsif ($opt_L) {
     128} elsif (defined($opt_L)) {
    121129    die "Kea cannot understand -L argument (must be a number): $opt_L\n";
    122130}
     
    124132# -E What extension shall we use?
    125133$extension = "kea";
    126 if ($opt_E) {
     134if (defined($opt_E)) {
    127135    $extension = $opt_E;
    128136    $extension =~ s/^\.//g;
     
    131139
    132140# -t Should we output tfidf?  (This option is used by Kniles.)
    133 if ($opt_t && $opt_t) {
     141if (defined($opt_t) && $opt_t) {
    134142    $output_tfidf = 1;
    135143    print STDERR "Do print tf.idf\n";
     
    141149# -C Corpus file stem
    142150$default_stem = "$gsdlhome/perllib/Kea-1.1.4/aliweb";
    143 if ($opt_C) {
     151if (defined($opt_C)) {
    144152    print STDERR "Corpus: $opt_C (setting default model/stopwords/df)\n";
    145153    $default_stem = "$gsdlhome/perllib/Kea-1.1.4/$opt_C";
     
    155163
    156164# -F Document Frequency file
    157 $frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F" if ($opt_F);
    158 print STDERR "Document frequency file: $frequency_file\n";
     165if (defined($opt_F)) {$frequency_file = "$gsdlhome/perllib/Kea-1.1.4/$opt_F";}
     166if ($opt_d) {print STDERR "Document frequency file: $frequency_file\n";}
    159167die "Document frequency file does not exist!\n" if (!(-e $frequency_file));
    160168
    161169# -M Model file
    162 $model_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_M" if ($opt_M);
    163 print STDERR "Model file: $model_file\n";
     170if (defined($opt_M)) {$model_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_M";}
     171if ($opt_d) {print STDERR "Model file: $model_file\n";}
    164172die "Model file does not exist!\n" if (!(-e $model_file));
    165173
    166174# -S Stopword file
    167 $stopword_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_S" if ($opt_S);
    168 print STDERR "Stopword file: $stopword_file\n";
     175if (defined($opt_S)) {$stopword_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_S";}
     176if ($opt_d) {print STDERR "Stopword file: $stopword_file\n";}
    169177die "Stopword file does not exist!\n" if (!(-e $stopword_file));
    170178
    171179# -K Keyword frequency file
    172 $keyword_frequency_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_K" if ($opt_K);
     180if (defined($opt_K)) {
     181    $keyword_frequency_file =  "$gsdlhome/perllib/Kea-1.1.4/$opt_K";
     182}
    173183if ($keyword_frequency_file) {
    174184    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
    175     die "Keyword frequency file does not exist!\n" if (!(-e $keyword_frequency_file));
     185    die "Keyword frequency file does not exist!\n" if (! -e $keyword_frequency_file);
    176186} else {
    177     print STDERR "No keyword frequency file (default)\n";
     187    if ($opt_d) {print STDERR "No keyword frequency file (default)\n";}
    178188}
    179189
    180190# Count the number of files
    181191$number_of_files = $#ARGV + 1;
    182 print STDERR "Number of files: $number_of_files\n\n";
     192if ($opt_d) {print STDERR "Number of files: $number_of_files\n\n";}
    183193
    184194
     
    197207
    198208# Process each input file into a working file
    199 print STDERR "Preparing input files in: $data\n";
     209if ($opt_d) {print STDERR "Preparing input files in: $data\n";}
    200210
    201211for ($f = 0; $f <= $#ARGV; $f++) {
    202     print STDERR "  document ", ($f+1), ": $ARGV[$f]\r";
     212    print STDERR "  document ", ($f+1), ": $ARGV[$f]\n";
    203213
    204214    $file = $ARGV[$f];
     
    233243$command .= " $data $arff";
    234244
    235 #print STDERR "** $command **\n";
    236 `$command`;
     245if ($opt_d) {
     246    print STDERR "** $command **\n";
     247}
     248system ("$command");
    237249
    238250
     
    244256$command .= " > $out 2> $err";
    245257
    246 print STDERR "** $command  **\n";
    247 `$command`;
     258if ($opt_d) {
     259    print STDERR "** $command  **\n";
     260}
     261system ("$command");
    248262
    249263
     
    288302
    289303#get rid of temporary files
    290 #if (!$opt_d && !$opt_d) {
    291 #  `rm -r $data $arff $out $err`;
    292 #}
    293 
    294 
    295 
    296 
    297 
    298 
    299 
     304if (!$opt_d) {
     305    system ("rm", ("-r", "$data", "$arff", "$out", "$err"));
     306}
  • trunk/gsdl/perllib/Kea-1.1.4/k4.pl

    r1989 r4281  
    3939# 1.1.1  Tweaked output a little; no changes to function
    4040
    41 print STDERR "\nk4.pl: the arff file builder for Kea\n";
     41print STDERR "k4.pl: the arff file builder for Kea\n";
    4242
    4343$gsdlhome = $ENV{'GSDLHOME'};
     
    5151#         -S <stopword file>
    5252
    53 require("getopts.pl");
    54 &Getopts("abciL:f:FsS:kK:t");
     53use Getopt::Std;
     54my $opt_i=0;
     55my $opt_t=0;
     56getopt("abciL:f:FsS:kK:t");
     57my $debug=0;
    5558
    5659# What files shall we use?
     
    7174    die "\n";
    7275}
    73 $directory = $ARGV[0];
     76my $directory = $ARGV[0];
    7477$directory =~ s/\/$//;
    7578$arfffile = $ARGV[1];
    76 print STDERR "Input directory: $directory\n";
    77 print STDERR "Output Arff file: $arfffile\n";
     79
     80if ($debug) {
     81    print STDERR "Input directory: $directory\n";
     82    print STDERR "Output Arff file: $arfffile\n";
     83}
     84
     85
     86
     87
     88# Is the current phrase a valid Carl phrase?
     89# The phrase is stored in $phrase.
     90my ($NOUN, $ADJ, $VERB, $INCC, $VBG); # globals, set below
     91
     92sub isValidCarlPhrase () {
     93    @wds = split(/ +/, $phrase);
     94    $index = 0;
     95    $wd = $wds[$index];
     96
     97    while ($index < $#wds) {
     98    # current word must be a noun, adj, or verb
     99    if (($wd =~ /^$NOUN$/) || ($wd =~ /^$ADJ$/) || ($wd =~ /^$VERB$/)) {
     100        $index++;
     101        $wd = $wds[$index];
     102    } else {
     103        return 0;
     104    }
     105
     106    # next is an optional incc
     107    if ($wd =~ /^$INCC$/) {
     108        # it is an incc, so advance one word
     109        $index++;
     110        $wd = $wds[$index];
     111    }
     112    }
     113
     114    # since we can advance two in the loop, it's possible to have
     115    # run out of input. If this is the case, then the phrase is
     116    # not brill, as there's no room for a final NN or VBG
     117    if ($index > $#wds) {
     118    return 0;
     119    }
     120
     121    # the last word must be either a noun or a vbg
     122    if (($wd =~ /^$VBG$/) || ($wd =~ /^$NOUN$/)) {
     123    return 1;
     124    } else {
     125    return 0;
     126    }
     127}
     128
     129
     130
     131
    78132
    79133
     
    84138    $max_phrase_length = 3;
    85139}
    86 print STDERR "Maximum phrase length: $max_phrase_length\n";
    87 
     140
     141if ($debug) {
     142    print STDERR "Maximum phrase length: $max_phrase_length\n";
     143}
    88144
    89145# Are we in Stopword mode, Brill mode, or Carl mode?
     
    93149} elsif ($opt_b) {
    94150    # Brill phrases
    95     print STDERR "Brill phrase mode\n";
     151    if ($debug) {print STDERR "Brill phrase mode\n";}
    96152    $brill_mode = 1;
    97153
     
    101157} elsif ($opt_c) {
    102158    # Carl phrases
    103     print STDERR "Carl phrase mode\n";
     159    if ($debug) {print STDERR "Carl phrase mode\n";}
    104160    $carl_mode = 1;
    105161
     
    108164    $NOUN = "($WORD\/NN[A-Z]*)";
    109165    $ADJ = "($WORD\/JJ[A-Z]*)";
    110     $VBG = "($WORD\ing\/VBG)";
    111     $VBN = "($WORD\ed\/VBN)";
     166    $VBG = "({$WORD}ing\/VBG)";
     167    $VBN = "({$WORD}ed\/VBN)";
    112168
    113169    $VB = "($WORD\/VB)";
     
    139195    }
    140196    }
    141     print STDERR "Using stopword file: $stopword_file\n";
     197    if ($debug) {
     198    print STDERR "Using stopword file: $stopword_file\n";
     199    }
    142200
    143201    # read the stopwords
     
    152210
    153211# Should we ignore stopwords in word counts?
    154 if ($opt_i && $opt_i) {
     212if ($opt_i) {
    155213    print STDERR "Ignoring stopwords in word counts.\n";
    156214    $use_all_words_in_word_count = 0;
     
    167225    $document_frequency_file = "document-frequencies";
    168226}
    169 print STDERR "Document frequency file: $document_frequency_file\n";
     227
     228if ($debug) {
     229    print STDERR "Document frequency file: $document_frequency_file\n";
     230}
    170231
    171232
     
    177238
    178239if ($opt_F && (-e "$document_frequency_file")) {
    179     print STDERR "Text files are covered by specified document frequency file\n";
     240    if ($debug) {
     241    print STDERR "Text files are covered by specified document "
     242        . "frequency file\n";
     243    }
    180244    $testfile = 0;
    181245} elsif ($opt_F) {
    182246    die "Document frequency file doesn't exist, -F option impossible\n";
    183247} elsif (-e "$document_frequency_file") {
    184     print STDERR "Text files are not covered by document frequency file\n";
     248    if ($debug) {
     249    print STDERR "Text files are not covered by document frequency file\n";
     250    }
    185251    $testfile = 1;
    186252} else {
     
    192258if ($opt_K && (-e $opt_K)) {
    193259    $keyword_frequency_file = $opt_K;
    194     print STDERR "Keyword frequency file: $keyword_frequency_file\n";
     260    if ($debug) {
     261    print STDERR "Keyword frequency file: $keyword_frequency_file\n";
     262    }
    195263    if ($opt_k && $opt_k) {
    196     print STDERR "Keyword frequency file covers input files\n";
     264    if ($debug) {
     265        print STDERR "Keyword frequency file covers input files\n";
     266    }
    197267    $kf_covers_input = 1;
    198268    } else {
    199     print STDERR "Keyword frequency is independent of input\n";
     269    if ($debug) {
     270        print STDERR "Keyword frequency is independent of input\n";
     271    }
    200272    $kf_covers_input = 0;
    201273    }
     
    204276} else {
    205277    $keyword_frequency_file = "";
    206     print STDERR "No keyword frequency file\n";
     278    if ($debug) {print STDERR "No keyword frequency file\n";}
    207279}
    208280
     
    214286} else {
    215287    $suppress_singleton_phrases = 1;
    216     print STDERR "Ignoring singleton phrases (default).\n"
     288    if ($debug) {print STDERR "Ignoring singleton phrases (default).\n"}
    217289}
    218290
     
    220292# Do we look for *.tagged or *.clauses?
    221293$suffix = "clauses";
    222 if ($opt_t && $opt_t) {
     294if ($opt_t) {
    223295    $suffix = "tagged";
    224296}
    225 print STDERR "Input file suffix: $suffix\n";
     297
     298if ($debug) {
     299    print STDERR "Input file suffix: $suffix\n";
     300}
    226301
    227302# How to run scripts etc
    228 $perl_command = "perl -w";
     303my $perl_command = "perl";
     304my $perl_args = ("-w"); # a list
    229305
    230306# Are we using Turney's data
     
    241317    die "\nk4.pl error: no documents found.";
    242318} else {
    243     print STDERR "\nProducing keyphrases for ", $#documents + 1, " documents\n";
    244 }
    245 
    246 print STDERR "Finding candidate phrases...\n";
     319    print STDERR "Producing keyphrases for ", $#documents + 1, " documents\n";
     320}
     321
     322if ($debug) {print STDERR "Finding candidate phrases...\n";}
    247323$document_number = 1;
    248324
    249325foreach $document (@documents) {
    250     print STDERR "  document $document_number: $document\r";
     326    if ($debug) {print STDERR "  document $document_number: $document\n";}
    251327    $document_number++;
    252328
     
    371447
    372448    $document_size{$document} = $distance; #WILL CHANGE THIS BACK
    373     `$perl_command /home/jmt14/gsdl/perllib/Kea-1.1.4/kea-choose-best-phrase.pl $document`;
     449
     450    system("$perl_command", ($perl_args, "$gsdlhome/perllib/Kea-1.1.4/kea-choose-best-phrase.pl", "$document"));
    374451}
    375452
     
    394471# Calculate document frequencies
    395472
    396 print STDERR "Gathering document frequencies...\n";
     473if ($debug) {print STDERR "Gathering document frequencies...\n";}
    397474
    398475if (-e "$document_frequency_file") {
    399    
    400     print STDERR "Found document frequencies -- reading them!\n";
    401    
     476    if ($debug) {
     477    print STDERR "Found document frequencies -- reading them!\n";
     478    }
    402479    open(F, "<$document_frequency_file");
    403480
     
    426503    $numdocs_in_global_corpus = $#documents + 1;
    427504   
    428     print STDERR "Writing document frequencies to file...\n";
     505    if ($debug) {print STDERR "Writing document frequencies to file...\n";}
    429506    open(F, ">$document_frequency_file");
    430507    print F "$numdocs_in_global_corpus\n";
     
    443520# Read the keyword frequency file
    444521if ($keyword_frequency_file) {
    445     print STDERR "Reading keyword frequency file\n";
     522    if ($debug) {print STDERR "Reading keyword frequency file\n";}
    446523   
    447524    open(KF, "<$keyword_frequency_file");
     
    449526    $size_of_keyword_frequency_file = <KF>;
    450527    chomp($size_of_keyword_frequency_file);
    451     print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n";
     528    if ($debug) {
     529    print STDERR "$size_of_keyword_frequency_file documents used to generate kf file.\n";
     530    }
    452531
    453532    while (<KF>) {
     
    483562# Write the arff files
    484563
    485 print STDERR "Writing ARFF file\n";
     564if ($debug) {print STDERR "Writing ARFF file\n";}
    486565
    487566open (ARFF, ">$arfffile");
     
    515594
    516595# the number of keyphrases not covered by the arff file
    517 print STDERR "Writing instances...\n";
     596if ($debug) {print STDERR "Writing instances...\n";}
    518597$not_included = 0;
    519598$document_number = 1;
    520599
    521600foreach $document (@documents) {
    522     print STDERR "  document $document_number: $document\r";
     601    if ($debug) {print STDERR "  document $document_number: $document\n";}
    523602    $document_number++;
    524603
     
    767846
    768847
    769 print STDERR "k4.pl: $arfffile complete\n\n";
    770 
    771 
    772 
    773 
    774 
    775 # Is the current phrase a valid Carl phrase?
    776 # The phrase is stored in $phrase.
    777 
    778 
    779 sub isValidCarlPhrase () {
    780     @wds = split(/ +/, $phrase);
    781     $index = 0;
    782     $wd = $wds[$index];
    783 
    784     while ($index < $#wds) {
    785     # current word must be a noun, adj, or verb
    786     if (($wd =~ /^$NOUN$/) || ($wd =~ /^$ADJ$/) || ($wd =~ /^$VERB$/)) {
    787         $index++;
    788         $wd = $wds[$index];
    789     } else {
    790         return 0;
    791     }
    792 
    793     # next is an optional incc
    794     if ($wd =~ /^$INCC$/) {
    795         # it is an incc, so advance one word
    796         $index++;
    797         $wd = $wds[$index];
    798     }
    799     }
    800 
    801     # since we can advance two in the loop, it's possible to have
    802     # run out of input. If this is the case, then the phrase is
    803     # not brill, as there's no room for a final NN or VBG
    804     if ($index > $#wds) {
    805     return 0;
    806     }
    807 
    808     # the last word must be either a noun or a vbg
    809     if (($wd =~ /^$VBG$/) || ($wd =~ /^$NOUN$/)) {
    810     return 1;
    811     } else {
    812     return 0;
    813     }
    814 }
     848if ($debug) {print STDERR "k4.pl: $arfffile complete\n\n";}
Note: See TracChangeset for help on using the changeset viewer.