Ignore:
Timestamp:
2005-07-06T15:27:45+12:00 (19 years ago)
Author:
kjdon
Message:

Jeffrey's new parsing modifications, committed at approximately 15:16 on 6 July

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r10155 r10218  
    5353use gsprintf 'gsprintf';
    5454use printusage;
     55#$%^
     56use parse2;
     57
    5558
    5659use GISBasPlug;
     
    5962
    6063my $unicode_list =
    61     [ { 'name' => "auto",
    62     'desc' => "{BasPlug.input_encoding.auto}" },
    63       { 'name' => "ascii",
     64    [ { 'name' => "ascii",
    6465    'desc' => "{BasPlug.input_encoding.ascii}" },
    6566      { 'name' => "utf8",
     
    6869    'desc' => "{BasPlug.input_encoding.unicode}" } ];
    6970
     71my $auto_unicode_list =
     72    [ { 'name' => "auto",
     73    'desc' => "{BasPlug.input_encoding.auto}" } ];
     74
    7075my $arguments =
    7176    [ { 'name' => "process_exp",
     
    9095    'desc' => "{BasPlug.input_encoding}",
    9196    'type' => "enum",
    92     'list' => $unicode_list,
     97    'list' => $auto_unicode_list,
    9398    'reqd' => "no" ,
    9499    'deft' => "auto" } ,
     
    107112    'type' => "language",
    108113    'deft' => "en",
     114    'char_length' => "2",
    109115    'reqd' => "no" },
    110116      { 'name' => "extract_acronyms",
     
    141147    'type' => "int",
    142148    'deft' => (localtime)[5]+1900,
     149    'char_length' => "4",
     150    #'range' => "2,100",
    143151    'reqd' => "no"},
    144152      { 'name' => "maximum_century",
    145153    'desc' => "{BasPlug.maximum_century}",
    146154    'type' => "string",
    147     'deft' => "",
     155    'deft' => "-1",
    148156    'reqd' => "no" },
    149157      { 'name' => "no_bibliography",
     
    154162    'desc' => "{BasPlug.no_cover_image}",
    155163    'type' => "flag",
    156     'reqd' => "no" } ];
     164    'reqd' => "no" },
     165      { 'name' => "extract_keyphrases",
     166    'desc' => "{BasPlug.extract_keyphrases}",
     167    'type' => "flag",
     168    'reqd' => "no",
     169    'hiddengli' => "yes" },
     170      { 'name' => "extract_keyphrase_options",
     171    'desc' => "{BasPlug.extract_keyphrase_options}",
     172    'type' => "string",
     173    'reqd' => "no",
     174    'hiddengli' => "yes" },
     175      { 'name' => "separate_cjk",
     176    'desc' => "{BasPlug.separate_cjk}",
     177    'type' => "flag",
     178    'reqd' => "no",
     179    'hiddengli' => "yes" },
     180      { 'name' => "smart_block",
     181    'desc' => "{BasPlug.smart_block}",
     182    'type' => "flag",
     183    'reqd' => "no",
     184    'hiddengli' => "yes" },
     185      { 'name' => "new_extract_email",
     186    'desc' => "",
     187    'type' => "flag",
     188    'reqd' => "no",
     189    'hiddengli' => "yes" } ];
    157190
    158191my $gis_arguments =
     
    238271{
    239272    my $self = shift(@_);
    240 
    241273    # Print the usage message for a plugin (recursively)
    242274    my $descoffset = $self->determine_description_offset(0);
     
    317349
    318350sub new {
     351    # Set Encodings to the list!!
     352
     353    my $e = $encodings::encodings;
     354    foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
     355    {
     356    my $hashEncode =
     357        {'name' => $enc,
     358         'desc' => $e->{$enc}->{'name'}};
     359
     360    push(@{$unicode_list},$hashEncode);
     361    }
     362
     363    push(@{$auto_unicode_list},@{$unicode_list});
     364
     365    # Start the BasPlug Constructor
    319366    my $class = shift (@_);
    320     my $plugin_name = shift (@_);
    321     my $self = {};
    322     $self->{'plugin_type'} = "BasPlug";
     367    my ($pluginlist,$args,$hashArgOptLists) = @_;
     368    push(@$pluginlist, $class);
     369    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
     370
     371    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
     372    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
    323373
    324374    if (GISBasPlug::has_mapdata()) {
    325375    push(@$arguments,@$gis_arguments);
    326376    }
    327 
    328     my $enc = "^(";
    329     map {$enc .= "$_|";} keys %$encodings::encodings;
    330     my $denc = $enc . "ascii|utf8|unicode)\$";
    331     $enc .= "ascii|utf8|unicode|auto)\$";
    332    
     377   
     378    my $self = {};
     379    if(!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
     380    {
     381    my $classTempClass = bless $self, $class;
     382    &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
     383    $classTempClass->print_txt_usage("");  # Use default resource bundle
     384    die "\n";
     385    }
     386
     387    # else parsing was successful.
     388
     389    $self->{'plugin_type'} = $plugin_name;
    333390    $self->{'outhandle'} = STDERR;
    334     my $year = (localtime)[5]+1900;
    335 
    336391    $self->{'textcat'} = new textcat();
    337 
    338392    $self->{'num_processed'} = 0;
    339393    $self->{'num_not_processed'} = 0;
     
    341395    $self->{'num_archives'} = 0;
    342396    $self->{'cover_image'} = 1; # cover image is on by default
    343 
    344     # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    345     $self->{'option_list'} = [ $options ];
    346    
    347     my $no_cover_image = 0;
    348     # general options available to all plugins
    349     if (!parsargv::parse(\@_,
    350              q^process_exp/.*/^, \$self->{'process_exp'},
    351              q^block_exp/.*/^, \$self->{'block_exp'},
    352              q^associate_ext/.*/^, \$self->{'associate_ext'},
    353              q^extract_language^, \$self->{'extract_language'},
    354              q^extract_acronyms^, \$self->{'extract_acronyms'},
    355              q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
    356              q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
    357              qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
    358              qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
    359              q^extract_email^, \$self->{'extract_email'},
    360              q^extract_placenames^, \$self->{'extract_placenames'},
    361              q^gazetteer/.*/^, \$self->{'gazetteer'},
    362              q^place_list^, \$self->{'place_list'},
    363              q^markup_acronyms^, \$self->{'markup_acronyms'},
    364              q^default_language/.{2}/en^, \$self->{'default_language'},
    365              q^first/.*/^, \$self->{'first'},
    366              q^extract_historical_years^, \$self->{'date_extract'},
    367              qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
    368              q^no_bibliography^, \$self->{'no_biblio'},
    369              qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
    370              q^no_cover_image^, \$no_cover_image,
    371              q^separate_cjk^, \$self->{'separate_cjk'},
    372              q^smart_block^, \$self->{'smart_block'},
    373              q^smart_block_BN^, \$self->{'smart_block_BN'},
    374              "allow_extra_options")) {
    375 
    376     gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
    377         bless $self, $class;
    378     $self->print_txt_usage("");  # Use default resource bundle
    379     die "\n";
    380     }
    381 
     397    $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
     398    $self->{'file_blocks'} = {};
     399    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
     400   
    382401    my $associate_ext = $self->{'associate_ext'};
    383402    if ((defined $associate_ext) && ($associate_ext ne "")) {
     
    395414    $self->{'file_blocks'} = {};
    396415
    397     $self->{'cover_image'} = 0 if ($no_cover_image);
    398 
    399416    if ($self->{'extract_placenames'}) {
    400417
    401418    my $outhandle = $self->{'outhandle'};
    402 
     419   
    403420    my $places_ref
    404421        = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
    405 
     422   
    406423    if (!defined $places_ref) {
    407424        print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
     
    414431    }   
    415432    return bless $self, $class;
     433   
    416434}
    417435
     
    11211139    @email = sort @email;
    11221140   
    1123     my @email2 = ();
     1141#    if($self->{"new_extract_email"} == 0)
     1142#    {
     1143#    my @email2 = ();
     1144#    foreach my $address (@email)
     1145#   {
     1146#   if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
     1147#       {
     1148#       push @email2, $address;
     1149#       $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
     1150#       # print $outhandle "  extracting $address\n"
     1151#       &gsprintf($outhandle, "  {BasPlug.extracting} $address\n")
     1152#           if ($self->{'verbosity'} > 3);
     1153#       }
     1154#   }
     1155#    }
     1156#    else
     1157#    {
     1158    my $hashExistMail = {};
    11241159    foreach my $address (@email) {
    1125     if (!(join(" ",@email2) =~ m/$address/ )) {
    1126         push @email2, $address;
     1160    if (!(defined $hashExistMail->{$address}))
     1161    {
     1162        $hashExistMail->{$address} = 1;
    11271163        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
    11281164        gsprintf($outhandle, "  {BasPlug.extracting} $address\n")
     
    11571193    }
    11581194
    1159     # adding kea keyphrases
    1160     if ($self->{'kea'}) {
     1195    #adding kea keyphrases
     1196    if ($self->{'extract_keyphrases'}) { 
    11611197   
    11621198    my $thissection = $doc_obj->get_top_section();
     
    11711207    }
    11721208       
    1173     if ($self->{'kea_options'}) {
    1174         #if kea options flag is set, call Kea with specified options
    1175         $list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
    1176     } else {
    1177         #otherwise call Kea with no options
     1209    if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
     1210        $list = &Kea::extract_KeyPhrases ($text, $self->{'extract_keyphrase_options'});
     1211    } else { #otherwise call Kea with no options
    11781212        $list = &Kea::extract_KeyPhrases ($text);
    11791213    }
     1214     
    11801215    if ($list){
    11811216        # if a list of kea keyphrases was returned (ie not empty)
     
    12271262    }
    12281263
    1229     if($self->{'date_extract'}) {
     1264    if($self->{'extract_historical_years'}) {
    12301265    my $thissection = $doc_obj->get_top_section();
    12311266    while (defined $thissection) {
    1232        
     1267
    12331268        my $text = $doc_obj->get_text($thissection);
    12341269        &DateExtract::get_date_metadata($text, $doc_obj,
    12351270                        $thissection,
    1236                         $self->{'no_biblio'},
    1237                         $self->{'max_year'},
    1238                         $self->{'max_century'});
     1271                        $self->{'no_bibliography'},
     1272                        $self->{'maximum_year'},
     1273                        $self->{'maximum_century'});
    12391274        $thissection = $doc_obj->get_next_section ($thissection);
    12401275    }
Note: See TracChangeset for help on using the changeset viewer.