Changeset 11069


Ignore:
Timestamp:
2006-01-19T16:00:06+13:00 (18 years ago)
Author:
mdewsnip
Message:

Added an option to use Kea 4.0 -- this isn't included with Greenstone, but it will tell you where to get it.

Location:
trunk/gsdl/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/Kea.pm

    r9409 r11069  
    1414# appropriate format.
    1515
     16sub get_Kea_directory
     17{
     18    my $kea_version = shift(@_);
     19    return &util::filename_cat($ENV{'GSDLHOME'}, "packages", "kea", "kea-$kea_version");
     20}
     21
    1622# returns a string containing comma-separated keyphrases
    1723sub extract_KeyPhrases {
    1824
    1925    # Parsing arguments of the function
     26    my $kea_version = shift(@_);
    2027    my $doc = shift(@_); # documents text 
    2128    my $args = shift(@_); # any options
     
    2330
    2431    # Specifying directory names
    25     my $gsdlhome = $ENV{'GSDLHOME'};
    26     my $keahome = "$gsdlhome/packages/kea/kea-3.0";
    27     my $defaultmodel = "$keahome/CSTR";
    28    
     32    my $keahome = &get_Kea_directory($kea_version);
     33    my $defaultmodel = &util::filename_cat($keahome, "CSTR-20");
     34    if ($kea_version eq "4.0") {
     35    # Use a different model for Kea 4.0
     36    $defaultmodel = &util::filename_cat($keahome, "FAO-20docs");
     37    }
     38
    2939    # Initializing variables:
    3040    my $command = "";
     
    91101
    92102    # Write text to a temporary file doc.txt
     103    my $gsdlhome = $ENV{'GSDLHOME'};
    93104    open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n"; 
    94105    print OUT $doc;
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r11044 r11069  
    135135      { 'name' => "extract_keyphrases",
    136136    'desc' => "{BasPlug.extract_keyphrases}",
     137    'type' => "flag",
     138    'reqd' => "no" },
     139      { 'name' => "extract_keyphrases_kea4",
     140    'desc' => "{BasPlug.extract_keyphrases_kea4}",
    137141    'type' => "flag",
    138142    'reqd' => "no" },
     
    12891293    }
    12901294
    1291     #adding kea keyphrases
    1292     if ($self->{'extract_keyphrases'}) { 
    1293    
    1294     my $thissection = $doc_obj->get_top_section();
    1295     my $text = "";
    1296     my $list;
    1297 
    1298     #loop through sections to gather whole doc
    1299     while (defined $thissection) {
    1300         my $sectiontext = $doc_obj->get_text($thissection);   
    1301         $text = $text.$sectiontext;
    1302         $thissection = $doc_obj->get_next_section ($thissection);
    1303     }
    1304        
    1305     if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
    1306         $list = &Kea::extract_KeyPhrases ($text, $self->{'extract_keyphrase_options'});
    1307     } else { #otherwise call Kea with no options
    1308         $list = &Kea::extract_KeyPhrases ($text);
    1309     }
    1310      
    1311     if ($list){
    1312         # if a list of kea keyphrases was returned (ie not empty)
    1313         if ($self->{'verbosity'}) {
    1314         gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
    1315         }
    1316 
    1317         #add metadata to top section
    1318         $thissection = $doc_obj->get_top_section();
    1319 
    1320         # add all key phrases as one metadata
    1321         $doc_obj->add_metadata($thissection, "Keyphrases", $list);
    1322 
    1323         # add individual key phrases as multiple metadata
    1324         foreach my $keyphrase (split(',', $list)) {
    1325             $keyphrase =~ s/^\s+|\s+$//g;
    1326         $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
    1327         }
    1328     }
    1329     } #end of kea
     1295    if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {
     1296    $self->extract_keyphrases($doc_obj);
     1297    }
    13301298
    13311299    if ($self->{'first'}) {
     
    13721340    }
    13731341}
     1342
     1343
     1344#adding kea keyphrases
     1345sub extract_keyphrases
     1346{
     1347    my $self = shift(@_);
     1348    my $doc_obj = shift(@_);
     1349
     1350    # Use Kea 3.0 unless 4.0 has been specified
     1351    my $kea_version = "3.0";
     1352    if ($self->{'extract_keyphrases_kea4'}) {
     1353    $kea_version = "4.0";
     1354    }
     1355
     1356    # Check that Kea exists, and tell the user where to get it if not
     1357    my $keahome = &Kea::get_Kea_directory($kea_version);
     1358    if (!-e $keahome) {
     1359    gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
     1360    return;
     1361    }
     1362
     1363    my $thissection = $doc_obj->get_top_section();
     1364    my $text = "";
     1365    my $list;
     1366
     1367    #loop through sections to gather whole doc
     1368    while (defined $thissection) {
     1369    my $sectiontext = $doc_obj->get_text($thissection);   
     1370    $text = $text.$sectiontext;
     1371    $thissection = $doc_obj->get_next_section ($thissection);
     1372    }
     1373   
     1374    if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
     1375    $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
     1376    } else { #otherwise call Kea with no options
     1377    $list = &Kea::extract_KeyPhrases ($kea_version, $text);
     1378    }
     1379 
     1380    if ($list){
     1381    # if a list of kea keyphrases was returned (ie not empty)
     1382    if ($self->{'verbosity'}) {
     1383        gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
     1384    }
     1385
     1386    #add metadata to top section
     1387    $thissection = $doc_obj->get_top_section();
     1388
     1389    # add all key phrases as one metadata
     1390    $doc_obj->add_metadata($thissection, "Keyphrases", $list);
     1391
     1392    # add individual key phrases as multiple metadata
     1393    foreach my $keyphrase (split(',', $list)) {
     1394        $keyphrase =~ s/^\s+|\s+$//g;
     1395        $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
     1396    }
     1397    }
     1398}
     1399
    13741400
    13751401# extract acronyms from a section in a document. progress is
  • trunk/gsdl/perllib/strings.rb

    r11008 r11069  
    571571BasPlug.extract_keyphrases:Extract keyphrases automatically with Kea (default settings).
    572572
     573BasPlug.extract_keyphrases_kea4:Extract keyphrases automatically with Kea 4.0 (default settings). Kea 4.0 is a new version of Kea that has been developed for controlled indexing of documents in the domain of agriculture.
     574
    573575BasPlug.extract_keyphrase_options:Options for keyphrase extraction with Kea. For example: mALIWEB - use ALIWEB extraction model; n5 - extract 5 keyphrases; eGBK - use GBK encoding.
    574576
     
    596598
    597599BasPlug.maximum_year:The maximum historical date to be used as metadata (in a Common Era date, such as 1950).
     600
     601BasPlug.missing_kea:Error: The Kea software could not be found at %s. Please download Kea %s from http://www.nzdl.org/Kea and install it in this directory.
    598602
    599603BasPlug.must_be_implemented:BasPlug::read function must be implemented in sub-class for recursive plugins
Note: See TracChangeset for help on using the changeset viewer.