Changeset 4873 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp:
- 2003-07-03T15:59:04+12:00 (21 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r4845 r4873 45 45 use printusage; 46 46 47 my $unicode_list = 47 my $unicode_list = 48 48 [ { 'name' => "auto", 49 'desc' => " Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." },49 'desc' => "{BasPlug.input_encoding.auto}" }, 50 50 { 'name' => "ascii", 51 'desc' => " Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },51 'desc' => "{BasPlug.input_encoding.ascii}" }, 52 52 { 'name' => "utf8", 53 'desc' => " either utf8 or unicode -- automatically detected." },53 'desc' => "{BasPlug.input_encoding.utf8}" }, 54 54 { 'name' => "unicode", 55 'desc' => " just unicode" } ];56 57 my $arguments = 55 'desc' => "{BasPlug.input_encoding.unicode}" } ]; 56 57 my $arguments = 58 58 [ { 'name' => "process_exp", 59 'desc' => " A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",59 'desc' => "{BasPlug.process_exp}", 60 60 'type' => "string", 61 61 'deft' => "", 62 62 'reqd' => "no" }, 63 63 { 'name' => "block_exp", 64 'desc' => " Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",65 'type' => 'string',64 'desc' => "{BasPlug.block_exp}", 65 'type' => "string", 66 66 'deft' => "", 67 67 'reqd' => "no" }, 68 68 { 'name' => "input_encoding", 69 'desc' => " The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8.",69 'desc' => "{BasPlug.input_encoding}", 70 70 'type' => "enum", 71 71 'list' => $unicode_list, … … 73 73 'deft' => "auto" } , 74 74 { 'name' => "default_encoding", 75 'desc' => " Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.",75 'desc' => "{BasPlug.default_encoding}", 76 76 'type' => "enum", 77 77 'reqd' => "no", 78 78 'deft' => "utf8" }, 79 79 { 'name' => "extract_language", 80 'desc' => " Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",80 'desc' => "{BasPlug.extract_language}", 81 81 'type' => "flag", 82 82 'reqd' => "no" }, 83 83 { 'name' => "default_language", 84 'desc' => " If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.",84 'desc' => "{BasPlug.default_language}", 85 85 'type' => "language", 86 86 'deft' => "en", 87 87 'reqd' => "no" }, 88 88 { 'name' => "extract_acronyms", 89 'desc' => " Extract acronyms from within text and set as metadata.",89 'desc' => "{BasPlug.extract_acronyms}", 90 90 'type' => "flag", 91 91 'reqd' => "no" }, 92 92 { 'name' => "markup_acronyms", 93 'desc' => " Add acronym metadata into document text.",93 'desc' => "{BasPlug.markup_acronyms}", 94 94 'type' => "flag", 95 95 'reqd' => "no" }, 96 96 { 'name' => "first", 97 'desc' => " Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.",97 'desc' => "{BasPlug.first}", 98 98 'type' => "string", 99 99 'reqd' => "no" }, 100 100 { 'name' => "extract_email", 101 'desc' => " Extract email addresses as metadata.",101 'desc' => "{BasPlug.extract_email}", 102 102 'type' => "flag", 103 103 'reqd' => "no" }, 104 104 { 'name' => "extract_historical_years", 105 'desc' => " Extract time-period information from historical documents. This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.",105 'desc' => "{BasPlug.extract_historical_years}", 106 106 'type' => "flag", 107 107 'reqd' => "no" }, 108 108 { 'name' => "maximum_year", 109 'desc' => " The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",109 'desc' => "{BasPlug.maximum_year}", 110 110 'type' => "int", 111 111 'deft' => (localtime)[5]+1900, 112 112 'reqd' => "no"}, 113 113 { 'name' => "maximum_century", 114 'desc' => " The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",114 'desc' => "{BasPlug.maximum_century}", 115 115 'type' => "int", 116 116 'deft' => "-1", 117 117 'reqd' => "no" }, 118 118 { 'name' => "no_bibliography", 119 'desc' => " Do not try to block bibliographic dates when extracting historical dates.",119 'desc' => "{BasPlug.no_bibliography}", 120 120 'type' => "flag", 121 121 'reqd' => "no"}, 122 122 { 'name' => "cover_image", 123 'desc' => " Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.",123 'desc' => "{BasPlug.cover_image}", 124 124 'type' => "flag", 125 125 'reqd' => "no" } ]; … … 131 131 132 132 133 sub get_arguments 134 { 135 local $self = shift(@_); 136 local $optionlistref = $self->{'option_list'}; 137 local @optionlist = @$optionlistref; 138 local $pluginoptions = pop(@$optionlistref); 139 local $pluginarguments = $pluginoptions->{'args'}; 140 return $pluginarguments; 141 } 142 143 133 144 sub print_xml_usage 134 145 { 135 146 local $self = shift(@_); 136 137 print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"; 138 $self->print_xml(); 147 local $language = shift(@_); 148 149 &PrintUsage::print_xml_header(); 150 $self->print_xml($language); 139 151 } 140 152 … … 143 155 { 144 156 local $self = shift(@_); 157 local $language = shift(@_); 145 158 146 159 local $optionlistref = $self->{'option_list'}; … … 155 168 print STDERR " <Arguments>\n"; 156 169 if (defined($pluginoptions->{'args'})) { 157 &PrintUsage::print_options_xml($ pluginoptions->{'args'});170 &PrintUsage::print_options_xml($language, $pluginoptions->{'args'}); 158 171 } 159 172 160 173 # Recurse up the plugin hierarchy 161 $self->print_xml( );174 $self->print_xml($language); 162 175 163 176 print STDERR " </Arguments>\n"; … … 169 182 { 170 183 local $self = shift(@_); 184 local $language = shift(@_); 171 185 172 186 # Print the usage message for a plugin (recursively) 173 187 local $descoffset = $self->determine_description_offset(0); 174 $self->print_plugin_usage($ descoffset, 1);188 $self->print_plugin_usage($language, $descoffset, 1); 175 189 } 176 190 … … 205 219 { 206 220 local $self = shift(@_); 221 local $language = shift(@_); 207 222 local $descoffset = shift(@_); 208 223 local $isleafclass = shift(@_); … … 234 249 235 250 # Display the plugin options 236 &PrintUsage::print_options_txt($ pluginargs, $optiondescoffset);251 &PrintUsage::print_options_txt($language, $pluginargs, $optiondescoffset); 237 252 } 238 253 239 254 # Recurse up the plugin hierarchy 240 $self->print_plugin_usage($ descoffset, 0);255 $self->print_plugin_usage($language, $descoffset, 0); 241 256 $self->{'option_list'} = \@optionlist; 242 257 } … … 380 395 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n"; 381 396 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n"; 382 # &print_general_usage($plugin_name); 383 $self->print_txt_usage(); 397 $self->print_txt_usage(""); # Use default resource bundle 384 398 die "\n"; 385 399 }
Note:
See TracChangeset
for help on using the changeset viewer.