Changeset 11069
- Timestamp: 2006-01-19T16:00:06+13:00 (18 years ago)
- Location: trunk/gsdl/perllib
- Files: 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/Kea.pm
r9409 r11069 14 14 # appropriate format. 15 15 16 sub get_Kea_directory 17 { 18 my $kea_version = shift(@_); 19 return &util::filename_cat($ENV{'GSDLHOME'}, "packages", "kea", "kea-$kea_version"); 20 } 21 16 22 # returns a string containing comma-separated keyphrases 17 23 sub extract_KeyPhrases { 18 24 19 25 # Parsing arguments of the function 26 my $kea_version = shift(@_); 20 27 my $doc = shift(@_); # documents text 21 28 my $args = shift(@_); # any options … … 23 30 24 31 # Specifying directory names 25 my $gsdlhome = $ENV{'GSDLHOME'}; 26 my $keahome = "$gsdlhome/packages/kea/kea-3.0"; 27 my $defaultmodel = "$keahome/CSTR"; 28 32 my $keahome = &get_Kea_directory($kea_version); 33 my $defaultmodel = &util::filename_cat($keahome, "CSTR-20"); 34 if ($kea_version eq "4.0") { 35 # Use a different model for Kea 4.0 36 $defaultmodel = &util::filename_cat($keahome, "FAO-20docs"); 37 } 38 29 39 # Initializing variables: 30 40 my $command = ""; … … 91 101 92 102 # Write text to a temporary file doc.txt 103 my $gsdlhome = $ENV{'GSDLHOME'}; 93 104 open(OUT, ">$gsdlhome/tmp/doc.txt") or die "In Kea.pm doc.txt could not be created\n"; 94 105 print OUT $doc; -
trunk/gsdl/perllib/plugins/BasPlug.pm
r11044 r11069 135 135 { 'name' => "extract_keyphrases", 136 136 'desc' => "{BasPlug.extract_keyphrases}", 137 'type' => "flag", 138 'reqd' => "no" }, 139 { 'name' => "extract_keyphrases_kea4", 140 'desc' => "{BasPlug.extract_keyphrases_kea4}", 137 141 'type' => "flag", 138 142 'reqd' => "no" }, … … 1289 1293 } 1290 1294 1291 #adding kea keyphrases 1292 if ($self->{'extract_keyphrases'}) { 1293 1294 my $thissection = $doc_obj->get_top_section(); 1295 my $text = ""; 1296 my $list; 1297 1298 #loop through sections to gather whole doc 1299 while (defined $thissection) { 1300 my $sectiontext = $doc_obj->get_text($thissection); 1301 $text = $text.$sectiontext; 1302 $thissection = $doc_obj->get_next_section ($thissection); 1303 } 1304 1305 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options 1306 $list = &Kea::extract_KeyPhrases ($text, $self->{'extract_keyphrase_options'}); 1307 } else { #otherwise call Kea with no options 1308 $list = &Kea::extract_KeyPhrases ($text); 1309 } 1310 1311 if ($list){ 1312 # if a list of kea keyphrases was returned (ie not empty) 1313 if ($self->{'verbosity'}) { 1314 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n"); 1315 } 1316 1317 #add metadata to top section 1318 $thissection = $doc_obj->get_top_section(); 1319 1320 # add all key phrases as one metadata 1321 $doc_obj->add_metadata($thissection, "Keyphrases", $list); 1322 1323 # add individual key phrases as multiple metadata 1324 foreach my $keyphrase (split(',', $list)) { 1325 $keyphrase =~ s/^\s+|\s+$//g; 1326 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase); 1327 } 1328 } 1329 } #end of kea 1295 if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) { 1296 $self->extract_keyphrases($doc_obj); 1297 } 1330 1298 1331 1299 if ($self->{'first'}) { … … 1372 1340 } 1373 1341 } 1342 1343 1344 #adding kea keyphrases 1345 sub extract_keyphrases 1346 { 1347 my $self = shift(@_); 1348 my $doc_obj = shift(@_); 1349 
1350 # Use Kea 3.0 unless 4.0 has been specified 1351 my $kea_version = "3.0"; 1352 if ($self->{'extract_keyphrases_kea4'}) { 1353 $kea_version = "4.0"; 1354 } 1355 1356 # Check that Kea exists, and tell the user where to get it if not 1357 my $keahome = &Kea::get_Kea_directory($kea_version); 1358 if (!-e $keahome) { 1359 gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version); 1360 return; 1361 } 1362 1363 my $thissection = $doc_obj->get_top_section(); 1364 my $text = ""; 1365 my $list; 1366 1367 #loop through sections to gather whole doc 1368 while (defined $thissection) { 1369 my $sectiontext = $doc_obj->get_text($thissection); 1370 $text = $text.$sectiontext; 1371 $thissection = $doc_obj->get_next_section ($thissection); 1372 } 1373 1374 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options 1375 $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'}); 1376 } else { #otherwise call Kea with no options 1377 $list = &Kea::extract_KeyPhrases ($kea_version, $text); 1378 } 1379 1380 if ($list){ 1381 # if a list of kea keyphrases was returned (ie not empty) 1382 if ($self->{'verbosity'}) { 1383 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n"); 1384 } 1385 1386 #add metadata to top section 1387 $thissection = $doc_obj->get_top_section(); 1388 1389 # add all key phrases as one metadata 1390 $doc_obj->add_metadata($thissection, "Keyphrases", $list); 1391 1392 # add individual key phrases as multiple metadata 1393 foreach my $keyphrase (split(',', $list)) { 1394 $keyphrase =~ s/^\s+|\s+$//g; 1395 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase); 1396 } 1397 } 1398 } 1399 1374 1400 1375 1401 # extract acronyms from a section in a document. progress is -
trunk/gsdl/perllib/strings.rb
r11008 r11069 571 571 BasPlug.extract_keyphrases:Extract keyphrases automatically with Kea (default settings). 572 572 573 BasPlug.extract_keyphrases_kea4:Extract keyphrases automatically with Kea 4.0 (default settings). Kea 4.0 is a new version of Kea that has been developed for controlled indexing of documents in the domain of agriculture. 574 573 575 BasPlug.extract_keyphrase_options:Options for keyphrase extraction with Kea. For example: mALIWEB - use ALIWEB extraction model; n5 - extract 5 keyphrase;, eGBK - use GBK encoding. 574 576 … … 596 598 597 599 BasPlug.maximum_year:The maximum historical date to be used as metadata (in a Common Era date, such as 1950). 600 601 BasPlug.missing_kea:Error: The Kea software could not be found at %s. Please download Kea %s from http://www.nzdl.org/Kea and install it in this directory. 598 602 599 603 BasPlug.must_be_implemented:BasPlug::read function must be implemented in sub-class for recursive plugins
Note: See TracChangeset for help on using the changeset viewer.