Changeset 8814


Ignore:
Timestamp:
2004-12-15T14:10:41+13:00 (19 years ago)
Author:
mdewsnip
Message:

Updated files for Kea 3.0, thanks to Olena.

Location:
trunk/gsdl/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/Kea.pm

    r6792 r8814  
    11package Kea;
    2 
    3 use strict;
    42
    53# This function is called by BasPlug.pm when a flag in a collection
     
    97# how the keyphrase data is to be collected if the keyphrase option flag was
    108# set in the collection configuration file.  This module then writes the
    11 # documents text to a file because the stand-alone program Kea which will be
    12 # called to do the actual extraction of the keyphrases expects a file argument.
     9# document's text to a file in a temporary directory because the stand-alone program Kea, which will be
     10# called to do the actual extraction of the keyphrases, expects a directory with one or more files as its argument.
    1311# Once Kea has been called upon, the file containing the keyphrase data
    1412# gathered by Kea should be stored in gsdl/tmp and this file is read, the data
     
    1816sub extract_KeyPhrases {
    1917
     18    # Parsing arguments of the function
     19    my $doc = shift(@_); # documents text 
     20    my $args = shift(@_); # any options
     21    my @optionlist = split(/\s+/, $args) if (defined($args)); #list of options
     22
     23    # Specifying directory names
    2024    my $gsdlhome = $ENV{'GSDLHOME'};
    21     my $doc = shift(@_); #documents text 
    22     my $args = shift(@_); #any options
    23     my @optionlist = split(/ +/, $args) if (defined($args)); #list of options
    24     my $suffix = 'kea'; #default file will be called .kea
    25     my @kea_options;
     25    my $keahome = "$gsdlhome/packages/kea/kea-3.0";
     26    my $defaultmodel = "$keahome/CSTR";
     27   
     28    # Initializing variables:
     29    my $command = "";
    2630    my @keylist;
    27     my @stemlist;
     31    my @options = ();
     32    $modelspec = 0;
    2833
     34    # Settings for the java executable:
    2935
    30     foreach my $element (@optionlist){ #for each option
    31     my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist)
     36    # CLASSPATH:
     37    $java_classpath = ".:$keahome";
    3238
    33     $option  = "-".$option; #place dash in front of option
    34     push @kea_options, $option;
    35     if (defined($file)) {
    36         push @kea_options, $file;
     39    # See if java executable is on path
     40    my $java_exec="";
     41    if (system("which java >/dev/null 2>/dev/null")==0) {
     42    $java_exec=`which java`;
     43    chomp $java_exec;
     44    } else {
     45    $java_exec="$java_home/bin/java";
     46    }
     47   
     48    # The actual java command is based on these other variables:
     49    $java_command = "$java_exec -classpath \"$java_classpath\"";
     50   
     51    # end of java settings
     52
     53    # Parsing options for keyphrase extraction:
     54    if (@optionlist) {
     55    foreach $element (@optionlist){ #for each option
     56        if (length($element) == 1) {
     57        push(@options, "-$element");
     58        } else {
     59        $option = substr($element, 0, 1);
     60        $value = substr($element,1);
     61        if (($option eq "m") && (-e "$keahome/$value")) {
     62            $modelspec = 1;
     63            push(@options, "-$option $keahome/$value");
     64        } elsif ($option eq "m") {         
     65            $modelspec = 1;
     66            print STDERR "Couldn't find model $value. Using the default model instead\n";
     67            push(@options, "-$option $defaultmodel");
     68        } else {
     69            push(@options, "-$option $value");
     70        }
     71       
     72        }
    3773    }
    38 
    39     if ($option eq '-E')  # option is extension (suffix) option
    40         { $suffix = $file }
     74    # if none of the option specifies the model, set the default one:
     75    if ($modelspec != 1) {
     76        push(@options, "-m $defaultmodel");
     77    }
     78    $options = join(" ",@options);
     79    # print STDERR "OPTIONS: $options\n";
     80    } else {
     81    # If no options were specified: Set default value for the model
     82    $options = "-m $defaultmodel"; 
    4183    }
    4284
    43     # print STDERR "Using output suffix: $suffix\n";
     85    # Remove all HTML tags from the original text
     86    $doc =~ s/<P[^>]*>/\n/sgi;
     87    $doc =~ s/<H[^>]*>/\n/sgi;
     88    $doc =~ s/<[^>]*>//sgi;
     89    $doc =~ tr/\n/\n/s;
    4490
    45     # remove all HTML tags
    46     $doc =~ s/<[ph][^>]*>/\n/sgi; # replace headings/paragraphs with newline
    47     $doc =~ s/<[^>]*>/ /sgi; # replace all others with a space
    48 
    49     # &gt; lt amp
    50     $doc =~ s/\&(?:gt|lt|amp)\;/ /gi;
    51 
    52     my $tmpfile="$gsdlhome/tmp/doc.txt";
    53     open(OUT, ">$tmpfile") or die "Kea.pm could not create doc.txt: $!\n"; 
     91    # Write text to a temporary file doc.txt
     92    open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n"; 
    5493    print OUT $doc;
    5594    close(OUT);
    5695
    57     # call Kea with specifed options
    58     system("$gsdlhome/perllib/Kea-1.1.4/Kea", @kea_options,
    59        $tmpfile);
    6096
    61     unlink($tmpfile); # don't need this file anymore
     97    # EXECUTE KEA with specific options:
     98    $command = "$java_command KEAKeyphraseExtractor -l $gsdlhome/tmp $options";
     99    system ("$command");
    62100
    63     # read doc.kea with keywords
    64     my $inputfile="$gsdlhome/tmp/doc.$suffix";
     101    # Read the resulting doc.key, which contains keyphrases:
    65102
    66     # If this file doesn't exist, then either an option was wrongly specified
    67     # or no keyphrases were found
    68     open(IN, "<$inputfile") or return ();
    69 
     103    open(IN, "<$gsdlhome/tmp/doc.key") or return @emptykeylist;
     104                                          #this means doc.key does not exist
     105                                          #either because an option was wrongly specified
     106                                      #or no keyphrases were found
    70107    while(<IN>){
    71108    chomp;
    72     my @key = split(/\t/); #split into array separated by a tab
    73     push(@keylist, $key[0]); #add to list of keywords
    74     push(@stemlist, $key[1]); #add to list of stems 
     109    push(@keylist,$_);
    75110    }
    76111    close(IN);
    77112
    78     #put data into appropriate format
    79     my $keylistref = join(", ", @keylist);
    80     my $stemlistref = join(", ", @stemlist);   
    81  
    82     # delete doc.extension so that in future it will not be opened and read
    83     unlink($inputfile);
     113    $keylist = join(", ", @keylist);
    84114
    85     # return keywords + stems to basplug
    86     return ($keylistref, $stemlistref);
     115    # Delete doc.key so that in future it will not be opened and read.
     116    # Otherwise KEA sees it as author keyphrases!
     117
     118    `rm $gsdlhome/tmp/doc.key`;
     119   
     120    return $keylist;
    87121}
    88122
    89 
    90 
    911231;
    92 
    93 
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r8789 r8814  
    9696    'desc' => "{BasPlug.markup_acronyms}",
    9797    'type' => "flag",
    98     'reqd' => "no" },
     98    'reqd' => "no" }, 
    9999      { 'name' => "extract_keyphrases",
    100100    'desc' => "{BasPlug.extract_keyphrases}",
    101101    'type' => "flag",
    102     'reqd' => "no" }, 
     102    'reqd' => "no" },
    103103      { 'name' => "extract_keyphrase_options",
    104104    'desc' => "{BasPlug.extract_keyphrase_options}",
    105105    'type' => "string",
    106106    'deft' => "",
    107     'reqd' => "no" }, 
     107    'reqd' => "no" },
    108108      { 'name' => "first",
    109109    'desc' => "{BasPlug.first}",
     
    872872
    873873
    874 #adding kea keyphrases
     874    # adding kea keyphrases
     875
    875876    if ($self->{'kea'}) { 
    876877   
     
    879880    my @list;
    880881
    881     while (defined $thissection) { #loop through sections to gather whole doc
     882    #loop through sections to gather whole doc
     883    while (defined $thissection) {
    882884        my $sectiontext = $doc_obj->get_text($thissection);   
    883885        $text = $text.$sectiontext;
     
    885887    }
    886888       
    887     #if kea options flag is set, call Kea with specified options
     889
    888890    if($self->{'kea_options'}) {
    889         @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
    890     }
    891     #otherwise call Kea with no options
    892     else {
    893         @list = &Kea::extract_KeyPhrases ($text);
    894     }
    895      
    896     if(@list){ #if a list of kea keyphrases was returned (ie not empty)
    897         my $keyphrases = $list[0]; #first arg is keyphrase list
    898         my $stems = $list[1]; #second  arg is stemmed keyphrase list
    899         &gsprintf(STDERR, "{BasPlug.keyphrases}: $keyphrases\n");
    900         # print STDERR "keyphrases: $keyphrases\n";
    901         &gsprintf(STDERR, "{BasPlug.stems}: $stems\n");
    902         # print STDERR "stems: $stems\n";
    903         $thissection = $doc_obj->get_top_section(); #add metadata to top section
     891        #if kea options flag is set, call Kea with specified options
     892        $list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
     893    } else {
     894        #otherwise call Kea with no options
     895        $list = &Kea::extract_KeyPhrases ($text);
     896    }
     897    if($list){
     898        # if a list of kea keyphrases was returned (ie not empty)
     899        &gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
     900
     901        #add metadata to top section
     902        $thissection = $doc_obj->get_top_section();
     903
    904904        # add all key phrases as one metadata
    905         $doc_obj->add_metadata($thissection, "Keyphrases", $keyphrases);
     905        $doc_obj->add_metadata($thissection, "Keyphrases", $list);
     906
    906907        # add individual key phrases as multiple metadata
    907         foreach my $keyphrase (split(',', $keyphrases)) {
    908             $keyphrase =~ s/^\s*//; $keyphrase =~ s/\s*$//;
     908        foreach my $keyphrase (split(',', $list)) {
     909            $keyphrase =~ s/^\s+|\s+$//g;
    909910        $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
    910911        }
    911         $doc_obj->add_metadata($thissection, "stems", $stems);
    912     }
    913     } #end of kea
     912    }
     913    }
     914 
     915    #end of kea
    914916
    915917    if ($self->{'first'}) {
  • trunk/gsdl/perllib/strings.rb

    r8796 r8814  
    529529BasPlug.extract_keyphrases:Extract keyphrases automatically with Kea (default settings).
    530530
    531 BasPlug.extract_keyphrase_options:Options for keyphrase extraction with Kea. For example: L2 - length of extracted keyphrases is 2 terms, N5 - 5 keyphrases to extract.
     531BasPlug.extract_keyphrase_options:Options for keyphrase extraction with Kea. For example: mALIWEB - use ALIWEB extraction model; n5 - extract 5 keyphrases; eGBK - use GBK encoding.
    532532
    533533BasPlug.extracting_emails:extracting e-mail addresses
Note: See TracChangeset for help on using the changeset viewer.