Ignore:
Timestamp:
2001-02-13T10:58:26+13:00 (23 years ago)
Author:
jmt14
Message:

*** empty log message ***

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1903 r1954  
    2525
    2626package BasPlug;
    27 
     27use Kea;
    2828use parsargv;
    2929use multiread;
     
    130130    $self->{'outhandle'} = STDERR;
    131131    my $year = (localtime)[5]+1900;
    132 
     132   
     133 
    133134    # general options available to all plugins
    134135    if (!parsargv::parse(\@_,
    135136             q^process_exp/.*/^, \$self->{'process_exp'},
    136137             q^block_exp/.*/^, \$self->{'block_exp'},
     138             q^extract_acronyms^, \$self->{'extract_acronyms'},
     139             q^extract_keyphrases^, \$self->{'kea'}, #run Kea with no extra options
     140             q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #run Kea with these extra options
    137141             qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
    138142             qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
    139              q^extract_acronyms^, \$self->{'extract_acronyms'},
    140143             q^extract_email^, \$self->{'extract_email'},
    141144             q^markup_acronyms^, \$self->{'markup_acronyms'},
    142              q^extract_language^, \$self->{'extract_language'},
    143145             q^default_language/.{2}/en^, \$self->{'default_language'},
    144146             q^first/.*/^, \$self->{'first'},
     
    233235
    234236sub read {
    235     my $self = shift (@_);
     237    my $self = shift (@_); 
     238 
    236239    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    237240
     
    283286    return 0;
    284287    }
    285 
     288   
    286289    # include any metadata passed in from previous plugins
    287290    # note that this metadata is associated with the top level section
     
    290293    # do plugin specific processing of doc_obj
    291294    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
    292 
     295   
    293296    # do any automatic metadata extraction
    294297    $self->auto_extract_metadata ($doc_obj);
    295 
     298   
    296299    # add an OID
    297300    $doc_obj->set_OID();
     
    501504
    502505# extract metadata
    503 sub auto_extract_metadata {
     506sub auto_extract_metadata {
     507 
     508
    504509    my $self = shift (@_);
    505510    my ($doc_obj) = @_;
     
    512517        $thissection = $doc_obj->get_next_section ($thissection);
    513518    }
    514     }   
     519    }
     520
     521
     522#adding kea keyphrases
     523    if ($self->{'kea'}) { 
     524   
     525    my $thissection = $doc_obj->get_top_section();
     526    my $text = "";
     527    my @list;
     528
     529    while (defined $thissection) { #loop through sections to gather whole doc
     530        my $sectiontext = $doc_obj->get_text($thissection);   
     531        $text = $text.$sectiontext;
     532        $thissection = $doc_obj->get_next_section ($thissection);
     533    }
     534       
     535    if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
     536        @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
     537    } else { #otherwise call Kea with no options
     538        @list = &Kea::extract_KeyPhrases ($text);
     539    }
     540     
     541    if(@list){ #if a list of kea keyphrases was returned (ie not empty)
     542        my $keyphrases = $list[0]; #first arg is keyphrase list
     543        my $stems = $list[1]; #second  arg is stemmed keyphrase list
     544        print STDERR "keyphrases: $keyphrases\n";
     545        print STDERR "stems: $stems\n";
     546        $thissection = $doc_obj->get_top_section(); #add metadata to top section
     547        $doc_obj->add_metadata($thissection, "kea", $keyphrases);
     548        $doc_obj->add_metadata($thissection, "stems", $stems);
     549    }
     550    } #end of kea
     551
    515552    if ($self->{'first'}) {
    516553    my $thissection = $doc_obj->get_top_section();
Note: See TracChangeset for help on using the changeset viewer.