Changeset 3540 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp:
- 2002-11-18T17:43:56+13:00 (21 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
# Excerpt of BasPlug.pm as of r3540 (previous revision r3515).  The
# changeset viewer's line-number columns have been removed so that the
# code reads as ordinary Perl.
use ghtml;

# Fixed choices for the 'input_encoding' enum below.  When print_xml()
# emits that option it also appends every entry of $encodings::encodings.
my $unicode_list =
    [ { 'name' => "auto",
        'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." },
      { 'name' => "ascii",
        'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
      { 'name' => "utf8",
        'desc' => "either utf8 or unicode -- automatically detected." },
      { 'name' => "unicode",
        'desc' => "just unicode" } ];

# Descriptions of the options BasPlug itself declares.  Each entry has a
# 'name', 'desc', 'type' and 'reqd'; 'deft' (default) and 'list' (enum
# values) are optional and are only printed by print_xml() when defined.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'deft' => "",
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => 'string',
        'deft' => "",
        'reqd' => "no" },
      { 'name' => "input_encoding",
        'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. The default input_encoding is 'auto'.",
        'type' => "enum",
        'list' => $unicode_list,
        'reqd' => "no",
        'deft' => "auto" },
      # This option names an encoding, so it is described as a string with
      # the default its own description states (it was previously declared
      # as a "flag", which cannot carry a value).
      # NOTE(review): confirm against the option parser, which is not
      # visible in this excerpt.
      { 'name' => "default_encoding",
        'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone. The default is iso_8859_1.",
        'type' => "string",
        'deft' => "iso_8859_1",
        'reqd' => "no" },
      { 'name' => "extract_language",
        'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "default_language",
        'desc' => "If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.",
        'type' => "language",
        'deft' => "en",
        'reqd' => "no" },
      { 'name' => "extract_acronyms",
        'desc' => "Extract acronyms from within text and set as metadata.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "markup_acronyms",
        'desc' => "Add acronym metadata into document text.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "first",
        'desc' => "Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.",
        'type' => "string",
        'reqd' => "no" },
      { 'name' => "extract_email",
        'desc' => "Extract email addresses as metadata.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "extract_historical_years",
        'desc' => "Extract time-period information from historical documents. This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "maximum_year",
        'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",
        'type' => "int",
        'reqd' => "no" },
      { 'name' => "maximum_century",
        'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",
        'type' => "int",
        'reqd' => "no" },
      { 'name' => "no_bibliography",
        'desc' => "Do not try and block bibliographic dates when extracting historical dates.",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "cover_image",
        'desc' => "Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.",
        'type' => "flag",
        'reqd' => "no" } ];

# Top-level description of this plugin; the constructor stores it as the
# first element of $self->{'option_list'}.
my $options = { 'name'        => "BasPlug",
                'desc'        => "Base class for all the import plugins.",
                'inherits'    => "No",
                'args'        => $arguments,
                'process_exp' => "",
                'block_exp'   => "" };

# Print the XML declaration followed by the plugin description produced
# by print_xml().  All output goes to STDERR.
sub print_xml_usage {
    my $self = shift (@_);

    print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n";
    $self->print_xml();
}

# Print a <PlugInfo> element for every entry of $self->{'option_list'},
# starting with the last entry; each earlier entry is nested inside the
# <Arguments> element of the one after it.  All output goes to STDERR.
#
# The list is walked on a copy, so $self->{'option_list'} is left
# untouched and the method can be called more than once.
sub print_xml {
    my $self = shift (@_);

    my @pending = @{ $self->{'option_list'} || [] };
    _print_plug_info(@pending);
}

# Emit the <PlugInfo> element for the last description in @pending, then
# recurse on the remaining ones so they end up nested inside <Arguments>.
# Stops at the first undefined entry (or when the list is exhausted).
sub _print_plug_info {
    my (@pending) = @_;

    my $option = pop @pending;
    return unless defined $option;

    print STDERR "<PlugInfo>\n";
    print STDERR " <Name>" . _xml_escape($option->{'name'}) . "</Name>\n";
    print STDERR " <Desc>" . _xml_escape($option->{'desc'}) . "</Desc>\n";
    print STDERR " <Inherits>" . _xml_escape($option->{'inherits'}) . "</Inherits>\n";
    print STDERR " <Arguments>\n";

    foreach my $arg (@{ $option->{'args'} || [] }) {
        _print_xml_option($arg);
    }

    # Remaining (earlier) descriptions are printed inside <Arguments>.
    _print_plug_info(@pending);

    print STDERR " </Arguments>\n";
    print STDERR "</PlugInfo>\n";
}

# Emit one <Option> element for a single argument description.
sub _print_xml_option {
    my ($arg) = @_;

    print STDERR " <Option>\n";
    print STDERR " <Name>" . _xml_escape($arg->{'name'}) . "</Name>\n";
    print STDERR " <Desc>" . _xml_escape($arg->{'desc'}) . "</Desc>\n";
    print STDERR " <Type>" . _xml_escape($arg->{'type'}) . "</Type>\n";
    print STDERR " <Required>" . _xml_escape($arg->{'reqd'}) . "</Required>\n";

    if (defined $arg->{'list'}) {
        print STDERR " <List>\n";
        foreach my $value (@{ $arg->{'list'} }) {
            _print_xml_value($value->{'name'}, $value->{'desc'});
        }

        # Special case of 'input_encoding': also list every encoding in
        # $encodings::encodings, ordered by its 'name' field.
        if (defined $arg->{'name'} && $arg->{'name'} =~ m/^input_encoding$/i) {
            my $e = $encodings::encodings || {};
            foreach my $enc (sort { $e->{$a}->{'name'} cmp $e->{$b}->{'name'} } keys %$e) {
                _print_xml_value($enc, $e->{$enc}->{'name'});
            }
        }
        print STDERR " </List>\n";
    }

    if (defined $arg->{'deft'}) {
        print STDERR " <Default>" . _xml_escape($arg->{'deft'}) . "</Default>\n";
    }
    print STDERR " </Option>\n";
}

# Emit one <Value> element (a name/description pair) of an enum <List>.
sub _print_xml_value {
    my ($name, $desc) = @_;

    print STDERR " <Value>\n";
    print STDERR " <Name>" . _xml_escape($name) . "</Name>\n";
    print STDERR " <Desc>" . _xml_escape($desc) . "</Desc>\n";
    print STDERR " </Value>\n";
}

# Return $text made safe for use as XML character data.  '<' and '>' are
# always escaped; '&' is escaped unless it already starts an entity or
# character reference, so text that was escaped by hand is not escaped a
# second time.  An undefined value yields the empty string.
sub _xml_escape {
    my ($text) = @_;

    return "" unless defined $text;
    $text =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# ----------------------------------------------------------------------
# Remaining changeset context.  The changeset view shows only fragments
# of the code that follows and elides the lines in between, so the
# fragments are recorded here as comments rather than as incomplete code.
#
# Unchanged context directly after print_xml (r3540 lines 196-197):
#
#     sub print_general_usage {
#         my ($plugin_name) = @_;
#
# ... lines elided by the changeset view ...
#
# Context and addition further down (r3540 lines 306-312), inside a
# routine whose beginning is not shown:
#
#     $self->{'num_blocked'} = 0;
#     $self->{'num_archives'} = 0;
#
#     # 14-05-02 To allow for proper inheritance of arguments - John Thompson
#     $self->{'option_list'} = [ $options ];
#
#     # general options available to all plugins
Note:
See TracChangeset
for help on using the changeset viewer.