Changeset 1954 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp:
- 2001-02-13T10:58:26+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r1903 r1954 25 25 26 26 package BasPlug; 27 27 use Kea; 28 28 use parsargv; 29 29 use multiread; … … 130 130 $self->{'outhandle'} = STDERR; 131 131 my $year = (localtime)[5]+1900; 132 132 133 133 134 # general options available to all plugins 134 135 if (!parsargv::parse(\@_, 135 136 q^process_exp/.*/^, \$self->{'process_exp'}, 136 137 q^block_exp/.*/^, \$self->{'block_exp'}, 138 q^extract_acronyms^, \$self->{'extract_acronyms'}, 139 q^extract_keyphrases^, \$self->{'kea'}, #with extra options 140 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options 137 141 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, 138 142 qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'}, 139 q^extract_acronyms^, \$self->{'extract_acronyms'},140 143 q^extract_email^, \$self->{'extract_email'}, 141 144 q^markup_acronyms^, \$self->{'markup_acronyms'}, 142 q^extract_language^, \$self->{'extract_language'},143 145 q^default_language/.{2}/en^, \$self->{'default_language'}, 144 146 q^first/.*/^, \$self->{'first'}, … … 233 235 234 236 sub read { 235 my $self = shift (@_); 237 my $self = shift (@_); 238 236 239 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 237 240 … … 283 286 return 0; 284 287 } 285 288 286 289 # include any metadata passed in from previous plugins 287 290 # note that this metadata is associated with the top level section … … 290 293 # do plugin specific processing of doc_obj 291 294 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj)); 292 295 293 296 # do any automatic metadata extraction 294 297 $self->auto_extract_metadata ($doc_obj); 295 298 296 299 # add an OID 297 300 $doc_obj->set_OID(); … … 501 504 502 505 # extract metadata 503 sub auto_extract_metadata { 506 sub auto_extract_metadata { 507 508 504 509 my $self = shift (@_); 505 510 my ($doc_obj) = @_; … … 512 517 $thissection = $doc_obj->get_next_section ($thissection); 513 518 } 514 } 519 } 520 521 522 #adding kea keyphrases 523 if ($self->{'kea'}) { 524 525 my $thissection = $doc_obj->get_top_section(); 526 my $text = ""; 527 my @list; 528 529 while (defined $thissection) { #loop through sections to gather whole doc 530 my $sectiontext = $doc_obj->get_text($thissection); 531 $text = $text.$sectiontext; 532 $thissection = $doc_obj->get_next_section ($thissection); 533 } 534 535 if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options 536 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'}); 537 } else { #otherwise call Kea with no options 538 @list = &Kea::extract_KeyPhrases ($text); 539 } 540 541 if(@list){ #if a list of kea keyphrases was returned (ie not empty) 542 my $keyphrases = $list[0]; #first arg is keyphrase list 543 my $stems = $list[1]; #second arg is stemmed keyphrase list 544 print STDERR "keyphrases: $keyphrases\n"; 545 print STDERR "stems: $stems\n"; 546 $thissection = $doc_obj->get_top_section(); #add metadata to top section 547 $doc_obj->add_metadata($thissection, "kea", $keyphrases); 548 $doc_obj->add_metadata($thissection, "stems", $stems); 549 } 550 } #end of kea 551 515 552 if ($self->{'first'}) { 516 553 my $thissection = $doc_obj->get_top_section();
Note:
See TracChangeset
for help on using the changeset viewer.