Changeset 4744 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp:
- 2003-06-20T14:22:34+12:00 (21 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r3834 r4744 45 45 46 46 my $unicode_list = 47 [ { 'name' => "auto",48 49 { 'name' => "ascii",50 51 { 'name' => "utf8",52 53 { 'name' => "unicode",54 47 [ { 'name' => "auto", 48 'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } , 49 { 'name' => "ascii", 50 'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." }, 51 { 'name' => "utf8", 52 'desc' => "either utf8 or unicode -- automatically detected." }, 53 { 'name' => "unicode", 54 'desc' => "just unicode" } ]; 55 55 56 56 my $arguments = 57 57 [ { 'name' => "process_exp", 58 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e.all documents ending in .htm or .html (case-insensitive).",58 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 59 59 'type' => "string", 60 60 'deft' => "", … … 66 66 'reqd' => "no" }, 67 67 { 'name' => "input_encoding", 68 'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. The default input_encoding is 'auto'.",68 'desc' => "The encoding of the source documents. 
Documents will be converted from these encodings and stored internally as utf8.", 69 69 'type' => "enum", 70 70 'list' => $unicode_list, … … 72 72 'deft' => "auto" } , 73 73 { 'name' => "default_encoding", 74 'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone. The default is iso_8859_1.", 75 'type' => "flag", 76 'reqd' => "no" }, 74 'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.", 75 'type' => "enum", 76 'reqd' => "no", 77 'deft' => "utf8" }, 77 78 { 'name' => "extract_language", 78 79 'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.", … … 107 108 'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).", 108 109 'type' => "int", 110 'deft' => (localtime)[5]+1900, 109 111 'reqd' => "no"}, 110 112 { 'name' => "maximum_century", 111 113 'desc' => "The maximum named century to be extracted as historical metadata (e.g. 
14 will extract all references up to the 14th century).", 112 114 'type' => "int", 115 'deft' => "-1", 113 116 'reqd' => "no" }, 114 117 { 'name' => "no_bibliography", … … 196 199 } 197 200 201 202 # sub print_usage_new 203 # { 204 # } 205 206 207 sub print_usage_new 208 { 209 local $self = shift(@_); 210 local $optionlist = $self->{'option_list'}; 211 local $pluginoptions = pop(@$optionlist); 212 return if (!defined($pluginoptions)); 213 214 local $pluginname = $pluginoptions->{'name'}; 215 local $pluginargs = $pluginoptions->{'args'}; 216 217 # Produce the usage information using the data structure above 218 print STDERR " usage: plugin $pluginname"; 219 if (defined($pluginargs)) { 220 print STDERR " [options]"; 221 } 222 print STDERR "\n\n"; 223 224 # Display the plugin options, if there are some 225 if (defined($pluginargs)) { 226 # Find the length of the longest option string 227 local $maxlength = 0; 228 foreach $option (@$pluginargs) { 229 local $optionname = $option->{'name'}; 230 local $optiontype = $option->{'type'}; 231 232 local $optionstringlength = length($optionname); 233 if ($optiontype ne "flag") { 234 $optionstringlength = $optionstringlength + 3 + length($optiontype); 235 } 236 237 # Remember the longest 238 if ($optionstringlength > $maxlength) { 239 $maxlength = $optionstringlength; 240 } 241 } 242 243 # Calculate the column offset of the option descriptions 244 local $optiondescoffset = 3 + $maxlength + 2; 245 246 # Display the plugin options 247 print STDERR " options:\n"; 248 foreach $option (@$pluginargs) { 249 # Display option name 250 local $optionname = $option->{'name'}; 251 print STDERR " -$optionname"; 252 local $optionstringlength = 3 + length($optionname); 253 254 # Display option type, if the option is not a flag 255 local $optiontype = $option->{'type'}; 256 if ($optiontype ne "flag") { 257 print STDERR " <$optiontype>"; 258 $optionstringlength = $optionstringlength + (2 + length($optiontype) + 1); 259 } 260 261 # Display the 
option description 262 local $optiondesc = $option->{'desc'}; 263 &display_text_in_column($optiondesc, $optiondescoffset, $optionstringlength, 80); 264 265 # Show the default value for the option, if there is one 266 local $optiondefault = $option->{'deft'}; 267 if (defined($optiondefault)) { 268 print STDERR " " x $optiondescoffset; 269 print STDERR "Default: " . $optiondefault . "\n"; 270 } 271 272 # If the option has a list of possible values, display these 273 local $optionvalueslist = $option->{'list'}; 274 if (defined($optionvalueslist)) { 275 print STDERR "\n"; 276 foreach $optionvalue (@$optionvalueslist) { 277 local $optionvaluename = $optionvalue->{'name'}; 278 print STDERR " " x $optiondescoffset; 279 print STDERR "$optionvaluename:"; 280 281 local $optionvaluedesc = $optionvalue->{'desc'}; 282 &display_text_in_column($optionvaluedesc, ($optiondescoffset + 2), 283 $optiondescoffset + length($optionvaluename), 80); 284 } 285 } 286 287 # Special case for 'input_encoding' 288 if ($optionname =~ m/^input_encoding$/i) { 289 my $e = $encodings::encodings; 290 foreach $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) { 291 local $encodingname = $enc; 292 print STDERR " " x $optiondescoffset; 293 print STDERR "$enc:"; 294 295 local $encodingdesc = $e->{$enc}->{'name'}; 296 &display_text_in_column($encodingdesc, ($optiondescoffset + 2), 297 $optiondescoffset + length($encodingname), 80); 298 } 299 } 300 301 # Add a blank line to separate options 302 print STDERR "\n"; 303 } 304 } 305 306 # If the plugin inherits from another, do the parent now 307 if (defined($optionlist)) { 308 $self->print_usage_new(); 309 } 310 } 311 312 313 sub display_text_in_column 314 { 315 local ($text, $columnbeg, $firstlineoffset, $columnend) = @_; 316 317 # Spaces are put *before* words, so treat the column beginning as 1 smaller than it is 318 $columnbeg = $columnbeg - 1; 319 320 # Add some padding (if needed) for the first line 321 local $linelength = $columnbeg; 322 
if ($firstlineoffset < $columnbeg) { 323 print STDERR " " x ($columnbeg - $firstlineoffset); 324 } 325 else { 326 $linelength = $firstlineoffset; 327 } 328 329 # Break the text into words, and display one at a time 330 local @words = split(/ /, $text); 331 332 foreach $word (@words) { 333 # Unescape '<' and '>' characters 334 $word =~ s/&lt;/</g; 335 $word =~ s/&gt;/>/g; 336 337 # If printing this word would except the column end, start a new line 338 if (($linelength + length($word)) >= $columnend) { 339 print STDERR "\n"; 340 print STDERR " " x $columnbeg; 341 $linelength = $columnbeg; 342 } 343 344 # Write the word 345 print STDERR " " . $word; 346 $linelength = $linelength + (length($word) + 1); 347 } 348 349 print STDERR "\n"; 350 } 351 352 198 353 sub print_general_usage { 199 354 my ($plugin_name) = @_; … … 316 471 q^process_exp/.*/^, \$self->{'process_exp'}, 317 472 q^block_exp/.*/^, \$self->{'block_exp'}, 473 q^extract_language^, \$self->{'extract_language'}, 318 474 q^extract_acronyms^, \$self->{'extract_acronyms'}, 319 q^extract_keyphrases^, \$self->{'kea'}, #with extra options 320 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options 475 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED) 476 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED) 321 477 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, 322 478 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
Note:
See TracChangeset
for help on using the changeset viewer.