Changeset 8814
- Timestamp:
- 2004-12-15T14:10:41+13:00 (19 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/Kea.pm
r6792 r8814 1 1 package Kea; 2 3 use strict;4 2 5 3 # This function is called by BasPlug.pm when a flag in a collection … … 9 7 # how the keyphrase data is to be collected if the keyphrase option flag was 10 8 # set in the collection configuration file. This module then writes the 11 # documents text to a file because the stand-alone program Kea which will be12 # called to do the actual extraction of the keyphrases expects a fileargument.9 # documents text to a file in a temporary directory because the stand-alone program Kea which will be 10 # called to do the actual extraction of the keyphrases expects a directory with one or more files as argument. 13 11 # Once Kea has been called upon, the file containing the keyphrase data 14 12 # gathered by Kea should be stored in gsdl/tmp and this file is read, the data … … 18 16 sub extract_KeyPhrases { 19 17 18 # Parsing arguments of the function 19 my $doc = shift(@_); # documents text 20 my $args = shift(@_); # any options 21 my @optionlist = split(/\s+/, $args) if (defined($args)); #list of options 22 23 # Specifying directory names 20 24 my $gsdlhome = $ENV{'GSDLHOME'}; 21 my $ doc = shift(@_); #documents text22 my $ args = shift(@_); #any options23 my @optionlist = split(/ +/, $args) if (defined($args)); #list of options24 my $suffix = 'kea'; #default file will be called .kea25 my @kea_options;25 my $keahome = "$gsdlhome/packages/kea/kea-3.0"; 26 my $defaultmodel = "$keahome/CSTR"; 27 28 # Initializing variables: 29 my $command = ""; 26 30 my @keylist; 27 my @stemlist; 31 my @options = (); 32 $modelspec = 0; 28 33 34 # Settings for the java executable: 29 35 30 foreach my $element (@optionlist){ #for each option31 my ($option, $file) = split(/,/, $element); #split option letter and file (if file exist) 36 # CLASSPATH: 37 $java_classpath = ".:$keahome"; 32 38 33 $option = "-".$option; #place dash in front of option 34 push @kea_options, $option; 35 if (defined($file)) { 36 push @kea_options, $file; 39 # See if java executable is on path 40 my $java_exec=""; 41 if (system("which java >/dev/null 2>/dev/null")==0) { 42 $java_exec=`which java`; 43 chomp $java_exec; 44 } else { 45 $java_exec="$java_home/bin/java"; 46 } 47 48 # The actual java command is based on these other variables: 49 $java_command = "$java_exec -classpath \"$java_classpath\""; 50 51 # end of java settings 52 53 # Parsing options for keyphrase extraction: 54 if (@optionlist) { 55 foreach $element (@optionlist){ #for each option 56 if (length($element) == 1) { 57 push(@options, "-$element"); 58 } else { 59 $option = substr($element, 0, 1); 60 $value = substr($element,1); 61 if (($option eq "m") && (-e "$keahome/$value")) { 62 $modelspec = 1; 63 push(@options, "-$option $keahome/$value"); 64 } elsif ($option eq "m") { 65 $modelspec = 1; 66 print STDERR "Couldn't find model $value. Using the default model instead\n"; 67 push(@options, "-$option $defaultmodel"); 68 } else { 69 push(@options, "-$option $value"); 70 } 71 72 } 37 73 } 38 39 if ($option eq '-E') # option is extension (suffix) option 40 { $suffix = $file } 74 # if none of the option specifies the model, set the default one: 75 if ($modelspec != 1) { 76 push(@options, "-m $defaultmodel"); 77 } 78 $options = join(" ",@options); 79 # print STDERR "OPTIONS: $options\n"; 80 } else { 81 # If no options were specified: Set default value for the model 82 $options = "-m $defaultmodel"; 41 83 } 42 84 43 # print STDERR "Using output suffix: $suffix\n"; 85 # Remove all HTML tags from the original text 86 $doc =~ s/<P[^>]*>/\n/sgi; 87 $doc =~ s/<H[^>]*>/\n/sgi; 88 $doc =~ s/<[^>]*>//sgi; 89 $doc =~ tr/\n/\n/s; 44 90 45 # remove all HTML tags 46 $doc =~ s/<[ph][^>]*>/\n/sgi; # replace headings/paragraphs with newline 47 $doc =~ s/<[^>]*>/ /sgi; # replace all others with a space 48 49 # > lt amp 50 $doc =~ s/\&(?:gt|lt|amp)\;/ /gi; 51 52 my $tmpfile="$gsdlhome/tmp/doc.txt"; 53 open(OUT, ">$tmpfile") or die "Kea.pm could not create doc.txt: $!\n"; 91 # Write text to a temporary file doc.txt 92 open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n"; 54 93 print OUT $doc; 55 94 close(OUT); 56 95 57 # call Kea with specifed options58 system("$gsdlhome/perllib/Kea-1.1.4/Kea", @kea_options,59 $tmpfile);60 96 61 unlink($tmpfile); # don't need this file anymore 97 # EXECUTE KEA with specific options: 98 $command = "$java_command KEAKeyphraseExtractor -l $gsdlhome/tmp $options"; 99 system ("$command"); 62 100 63 # read doc.kea with keywords 64 my $inputfile="$gsdlhome/tmp/doc.$suffix"; 101 # Read the resulting doc.key, which contains keyphrases: 65 102 66 # If this file doesn't exist, then either an option was wrongly specified67 # or no keyphrases were found68 open(IN, "<$inputfile") or return ();69 103 open(IN, "<$gsdlhome/tmp/doc.key") or return @emptykeylist; 104 #this means doc.key does not exist 105 #either because an option was wrongly specified 106 #or no keyphrases were found 70 107 while(<IN>){ 71 108 chomp; 72 my @key = split(/\t/); #split into array separated by a tab 73 push(@keylist, $key[0]); #add to list of keywords 74 push(@stemlist, $key[1]); #add to list of stems 109 push(@keylist,$_); 75 110 } 76 111 close(IN); 77 112 78 #put data into appropriate format 79 my $keylistref = join(", ", @keylist); 80 my $stemlistref = join(", ", @stemlist); 81 82 # delete doc.extension so that in future it will not be opened and read 83 unlink($inputfile); 113 $keylist = join(", ", @keylist); 84 114 85 # return keywords + stems to basplug 86 return ($keylistref, $stemlistref); 115 # Delete doc.key so that in future it will not be opened and read. 116 # Otherwise KEA sees it as athor keyphrases! 117 118 `rm $gsdlhome/tmp/doc.key`; 119 120 return $keylist; 87 121 } 88 122 89 90 91 123 1; 92 93 -
trunk/gsdl/perllib/plugins/BasPlug.pm
r8789 r8814 96 96 'desc' => "{BasPlug.markup_acronyms}", 97 97 'type' => "flag", 98 'reqd' => "no" }, 98 'reqd' => "no" }, 99 99 { 'name' => "extract_keyphrases", 100 100 'desc' => "{BasPlug.extract_keyphrases}", 101 101 'type' => "flag", 102 'reqd' => "no" }, 102 'reqd' => "no" }, 103 103 { 'name' => "extract_keyphrase_options", 104 104 'desc' => "{BasPlug.extract_keyphrase_options}", 105 105 'type' => "string", 106 106 'deft' => "", 107 'reqd' => "no" }, 107 'reqd' => "no" }, 108 108 { 'name' => "first", 109 109 'desc' => "{BasPlug.first}", … … 872 872 873 873 874 #adding kea keyphrases 874 # adding kea keyphrases 875 875 876 if ($self->{'kea'}) { 876 877 … … 879 880 my @list; 880 881 881 while (defined $thissection) { #loop through sections to gather whole doc 882 #loop through sections to gather whole doc 883 while (defined $thissection) { 882 884 my $sectiontext = $doc_obj->get_text($thissection); 883 885 $text = $text.$sectiontext; … … 885 887 } 886 888 887 #if kea options flag is set, call Kea with specified options 889 888 890 if($self->{'kea_options'}) { 889 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'}); 890 } 891 #otherwise call Kea with no options 892 else { 893 @list = &Kea::extract_KeyPhrases ($text); 894 } 895 896 if(@list){ #if a list of kea keyphrases was returned (ie not empty) 897 my $keyphrases = $list[0]; #first arg is keyphrase list 898 my $stems = $list[1]; #second arg is stemmed keyphrase list 899 &gsprintf(STDERR, "{BasPlug.keyphrases}: $keyphrases\n"); 900 # print STDERR "keyphrases: $keyphrases\n"; 901 &gsprintf(STDERR, "{BasPlug.stems}: $stems\n"); 902 # print STDERR "stems: $stems\n"; 903 $thissection = $doc_obj->get_top_section(); #add metadata to top section 891 #if kea options flag is set, call Kea with specified options 892 $list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'}); 893 } else { 894 #otherwise call Kea with no options 895 $list = &Kea::extract_KeyPhrases ($text); 896 } 897 if($list){ 898 # if a list of kea keyphrases was returned (ie not empty) 899 &gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n"); 900 901 #add metadata to top section 902 $thissection = $doc_obj->get_top_section(); 903 904 904 # add all key phrases as one metadata 905 $doc_obj->add_metadata($thissection, "Keyphrases", $keyphrases); 905 $doc_obj->add_metadata($thissection, "Keyphrases", $list); 906 906 907 # add individual key phrases as multiple metadata 907 foreach my $keyphrase (split(',', $ keyphrases)) {908 $keyphrase =~ s/^\s *//; $keyphrase =~ s/\s*$//;908 foreach my $keyphrase (split(',', $list)) { 909 $keyphrase =~ s/^\s+|\s+$//g; 909 910 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase); 910 911 } 911 $doc_obj->add_metadata($thissection, "stems", $stems); 912 } 913 } #end of kea 912 } 913 } 914 915 #end of kea 914 916 915 917 if ($self->{'first'}) { -
trunk/gsdl/perllib/strings.rb
r8796 r8814 529 529 BasPlug.extract_keyphrases:Extract keyphrases automatically with Kea (default settings). 530 530 531 BasPlug.extract_keyphrase_options:Options for keyphrase extraction with Kea. For example: L2 - length of extracted keyphrases is 2 terms, N5 - 5 keyphrases to extract.531 BasPlug.extract_keyphrase_options:Options for keyphrase extraction with Kea. For example: mALIWEB - use ALIWEB extraction model; n5 - extract 5 keyphrase;, eGBK - use GBK encoding. 532 532 533 533 BasPlug.extracting_emails:extracting e-mail addresses
Note:
See TracChangeset
for help on using the changeset viewer.