Changeset 10218 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp:
- 2005-07-06T15:27:45+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r10155 r10218 53 53 use gsprintf 'gsprintf'; 54 54 use printusage; 55 #$%^ 56 use parse2; 57 55 58 56 59 use GISBasPlug; … … 59 62 60 63 my $unicode_list = 61 [ { 'name' => "auto", 62 'desc' => "{BasPlug.input_encoding.auto}" }, 63 { 'name' => "ascii", 64 [ { 'name' => "ascii", 64 65 'desc' => "{BasPlug.input_encoding.ascii}" }, 65 66 { 'name' => "utf8", … … 68 69 'desc' => "{BasPlug.input_encoding.unicode}" } ]; 69 70 71 my $auto_unicode_list = 72 [ { 'name' => "auto", 73 'desc' => "{BasPlug.input_encoding.auto}" } ]; 74 70 75 my $arguments = 71 76 [ { 'name' => "process_exp", … … 90 95 'desc' => "{BasPlug.input_encoding}", 91 96 'type' => "enum", 92 'list' => $ unicode_list,97 'list' => $auto_unicode_list, 93 98 'reqd' => "no" , 94 99 'deft' => "auto" } , … … 107 112 'type' => "language", 108 113 'deft' => "en", 114 'char_length' => "2", 109 115 'reqd' => "no" }, 110 116 { 'name' => "extract_acronyms", … … 141 147 'type' => "int", 142 148 'deft' => (localtime)[5]+1900, 149 'char_length' => "4", 150 #'range' => "2,100", 143 151 'reqd' => "no"}, 144 152 { 'name' => "maximum_century", 145 153 'desc' => "{BasPlug.maximum_century}", 146 154 'type' => "string", 147 'deft' => " ",155 'deft' => "-1", 148 156 'reqd' => "no" }, 149 157 { 'name' => "no_bibliography", … … 154 162 'desc' => "{BasPlug.no_cover_image}", 155 163 'type' => "flag", 156 'reqd' => "no" } ]; 164 'reqd' => "no" }, 165 { 'name' => "extract_keyphrases", 166 'desc' => "{BasPlug.extract_keyphrases}", 167 'type' => "flag", 168 'reqd' => "no", 169 'hiddengli' => "yes" }, 170 { 'name' => "extract_keyphrase_options", 171 'desc' => "{BasPlug.extract_keyphrase_options}", 172 'type' => "string", 173 'reqd' => "no", 174 'hiddengli' => "yes" }, 175 { 'name' => "separate_cjk", 176 'desc' => "{BasPlug.separate_cjk}", 177 'type' => "flag", 178 'reqd' => "no", 179 'hiddengli' => "yes" }, 180 { 'name' => "smart_block", 181 'desc' => "{BasPlug.smart_block}", 182 'type' => "flag", 183 'reqd' => "no", 184 'hiddengli' => "yes" }, 185 { 'name' => "new_extract_email", 186 'desc' => "", 187 'type' => "flag", 188 'reqd' => "no", 189 'hiddengli' => "yes" } ]; 157 190 158 191 my $gis_arguments = … … 238 271 { 239 272 my $self = shift(@_); 240 241 273 # Print the usage message for a plugin (recursively) 242 274 my $descoffset = $self->determine_description_offset(0); … … 317 349 318 350 sub new { 351 # Set Encodings to the list!! 352 353 my $e = $encodings::encodings; 354 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) 355 { 356 my $hashEncode = 357 {'name' => $enc, 358 'desc' => $e->{$enc}->{'name'}}; 359 360 push(@{$unicode_list},$hashEncode); 361 } 362 363 push(@{$auto_unicode_list},@{$unicode_list}); 364 365 # Start the BasPlug Constructor 319 366 my $class = shift (@_); 320 my $plugin_name = shift (@_); 321 my $self = {}; 322 $self->{'plugin_type'} = "BasPlug"; 367 my ($pluginlist,$args,$hashArgOptLists) = @_; 368 push(@$pluginlist, $class); 369 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class; 370 371 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} 372 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; 323 373 324 374 if (GISBasPlug::has_mapdata()) { 325 375 push(@$arguments,@$gis_arguments); 326 376 } 327 328 my $enc = "^("; 329 map {$enc .= "$_|";} keys %$encodings::encodings; 330 my $denc = $enc . "ascii|utf8|unicode)\$"; 331 $enc .= "ascii|utf8|unicode|auto)\$"; 332 377 378 my $self = {}; 379 if(!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self)) 380 { 381 my $classTempClass = bless $self, $class; 382 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name); 383 $classTempClass->print_txt_usage(""); # Use default resource bundle 384 die "\n"; 385 } 386 387 # else parsing was successful. 388 389 $self->{'plugin_type'} = $plugin_name; 333 390 $self->{'outhandle'} = STDERR; 334 my $year = (localtime)[5]+1900;335 336 391 $self->{'textcat'} = new textcat(); 337 338 392 $self->{'num_processed'} = 0; 339 393 $self->{'num_not_processed'} = 0; … … 341 395 $self->{'num_archives'} = 0; 342 396 $self->{'cover_image'} = 1; # cover image is on by default 343 344 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 345 $self->{'option_list'} = [ $options ]; 346 347 my $no_cover_image = 0; 348 # general options available to all plugins 349 if (!parsargv::parse(\@_, 350 q^process_exp/.*/^, \$self->{'process_exp'}, 351 q^block_exp/.*/^, \$self->{'block_exp'}, 352 q^associate_ext/.*/^, \$self->{'associate_ext'}, 353 q^extract_language^, \$self->{'extract_language'}, 354 q^extract_acronyms^, \$self->{'extract_acronyms'}, 355 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED) 356 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED) 357 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, 358 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'}, 359 q^extract_email^, \$self->{'extract_email'}, 360 q^extract_placenames^, \$self->{'extract_placenames'}, 361 q^gazetteer/.*/^, \$self->{'gazetteer'}, 362 q^place_list^, \$self->{'place_list'}, 363 q^markup_acronyms^, \$self->{'markup_acronyms'}, 364 q^default_language/.{2}/en^, \$self->{'default_language'}, 365 q^first/.*/^, \$self->{'first'}, 366 q^extract_historical_years^, \$self->{'date_extract'}, 367 qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'}, 368 q^no_bibliography^, \$self->{'no_biblio'}, 369 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'}, 370 q^no_cover_image^, \$no_cover_image, 371 q^separate_cjk^, \$self->{'separate_cjk'}, 372 q^smart_block^, \$self->{'smart_block'}, 373 q^smart_block_BN^, \$self->{'smart_block_BN'}, 374 "allow_extra_options")) { 375 376 gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name); 377 bless $self, $class; 378 $self->print_txt_usage(""); # Use default resource bundle 379 die "\n"; 380 } 381 397 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'}); 398 $self->{'file_blocks'} = {}; 399 $self->{'option_list'} = $hashArgOptLists->{"OptList"}; 400 382 401 my $associate_ext = $self->{'associate_ext'}; 383 402 if ((defined $associate_ext) && ($associate_ext ne "")) { … … 395 414 $self->{'file_blocks'} = {}; 396 415 397 $self->{'cover_image'} = 0 if ($no_cover_image);398 399 416 if ($self->{'extract_placenames'}) { 400 417 401 418 my $outhandle = $self->{'outhandle'}; 402 419 403 420 my $places_ref 404 421 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'}); 405 422 406 423 if (!defined $places_ref) { 407 424 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n"; … … 414 431 } 415 432 return bless $self, $class; 433 416 434 } 417 435 … … 1121 1139 @email = sort @email; 1122 1140 1123 my @email2 = (); 1141 # if($self->{"new_extract_email"} == 0) 1142 # { 1143 # my @email2 = (); 1144 # foreach my $address (@email) 1145 # { 1146 # if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ )) 1147 # { 1148 # push @email2, $address; 1149 # $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address); 1150 # # print $outhandle " extracting $address\n" 1151 # &gsprintf($outhandle, " {BasPlug.extracting} $address\n") 1152 # if ($self->{'verbosity'} > 3); 1153 # } 1154 # } 1155 # } 1156 # else 1157 # { 1158 my $hashExistMail = {}; 1124 1159 foreach my $address (@email) { 1125 if (!(join(" ",@email2) =~ m/$address/ )) { 1126 push @email2, $address; 1160 if (!(defined $hashExistMail->{$address})) 1161 { 1162 $hashExistMail->{$address} = 1; 1127 1163 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address); 1128 1164 gsprintf($outhandle, " {BasPlug.extracting} $address\n") … … 1157 1193 } 1158 1194 1159 # 1160 if ($self->{' kea'}) {1195 #adding kea keyphrases 1196 if ($self->{'extract_keyphrases'}) { 1161 1197 1162 1198 my $thissection = $doc_obj->get_top_section(); … … 1171 1207 } 1172 1208 1173 if ($self->{'kea_options'}) { 1174 #if kea options flag is set, call Kea with specified options 1175 $list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'}); 1176 } else { 1177 #otherwise call Kea with no options 1209 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options 1210 $list = &Kea::extract_KeyPhrases ($text, $self->{'extract_keyphrase_options'}); 1211 } else { #otherwise call Kea with no options 1178 1212 $list = &Kea::extract_KeyPhrases ($text); 1179 1213 } 1214 1180 1215 if ($list){ 1181 1216 # if a list of kea keyphrases was returned (ie not empty) … … 1227 1262 } 1228 1263 1229 if($self->{' date_extract'}) {1264 if($self->{'extract_historical_years'}) { 1230 1265 my $thissection = $doc_obj->get_top_section(); 1231 1266 while (defined $thissection) { 1232 1267 1233 1268 my $text = $doc_obj->get_text($thissection); 1234 1269 &DateExtract::get_date_metadata($text, $doc_obj, 1235 1270 $thissection, 1236 $self->{'no_biblio '},1237 $self->{'max _year'},1238 $self->{'max _century'});1271 $self->{'no_bibliography'}, 1272 $self->{'maximum_year'}, 1273 $self->{'maximum_century'}); 1239 1274 $thissection = $doc_obj->get_next_section ($thissection); 1240 1275 }
Note:
See TracChangeset
for help on using the changeset viewer.