Changeset 1244 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp: 2000-06-27T17:10:07+12:00 (24 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r1242 r1244 33 33 use doc; 34 34 35 sub print_ usage {35 sub print_general_usage { 36 36 my ($plugin_name) = @_; 37 37 38 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";39 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";40 41 38 print STDERR "\n usage: plugin $plugin_name [options]\n\n"; 42 print STDERR " currently supported general options are:\n";43 39 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 44 40 print STDERR " converted from these encodings and stored internally as\n"; … … 71 67 } 72 68 69 # print_usage should be overridden for any sub-classes having 70 # their own plugin specific options 71 sub print_usage { 72 print STDERR "\nThis plugin has no plugin specific options\n\n"; 73 74 } 75 73 76 sub new { 74 77 my $class = shift (@_); … … 85 88 q^extract_acronyms^, \$self->{'extract_acronyms'}, 86 89 "allow_extra_options")) { 87 &print_usage($plugin_name); 90 91 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n"; 92 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n"; 93 &print_general_usage($plugin_name); 88 94 die "\n"; 89 95 } … … 103 109 # set process_exp and block_exp to defaults unless they were 104 110 # explicitly set 105 if ((!$self->is_recursive()) && 111 112 if ((!$self->is_recursive()) and 106 113 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) { 107 114 108 115 $self->{'process_exp'} = $self->get_default_process_exp (); 109 116 if ($self->{'process_exp'} eq "") { 110 warn ref($self) . " Warning: Non-recursive plugin has no process_exp so will have no effect\n";117 warn ref($self) . 
" Warning: Non-recursive plugin has no process_exp\n"; 111 118 } 112 119 } … … 115 122 $self->{'block_exp'} = $self->get_default_block_exp (); 116 123 } 124 125 # handle input_encoding aliases 126 $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1"; 127 $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic"; 117 128 } 118 129 … … 152 163 # process() function and let this read() function keep control. 153 164 # 165 # recursive plugins (e.g. RecPlug) and specialized plugins like those 166 # capable of processing many documents within a single file (e.g. 167 # GMLPlug) should normally implement their own version of read() 168 # 154 169 # Return number of files processed, undef if can't process 155 170 # Note that $base_dir might be "" and that $file might … … 165 180 166 181 my $filename = &util::filename_cat($base_dir, $file); 167 return 0 if $ filename =~ /$self->{'block_exp'}/;182 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/; 168 183 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 169 184 return undef; … … 174 189 # create a new document 175 190 my $doc_obj = new doc ($file, "indexed_doc"); 176 my $cursection =177 191 178 192 # read in file ($text will be in utf8) … … 190 204 191 205 # do plugin specific processing of doc_obj 192 $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj); 193 194 # add text 195 $doc_obj->add_utf8_text ($cursection, $text); 206 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj)); 196 207 197 208 # do any automatic metadata extraction … … 207 218 } 208 219 220 # returns undef if file is rejected by the plugin 209 221 sub process { 210 222 my $self = shift (@_); … … 212 224 213 225 die "Basplug::process function must be implemented in sub-class\n"; 226 227 return undef; # never gets here 214 228 } 215 229 … … 223 237 224 238 $$textref = ""; 225 my $encoding = 
"";226 if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) {227 $encoding = "iso_8859_1";228 } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) {229 $encoding = "windows_1256";230 } else {231 $encoding = $self->{'input_encoding'};232 }233 239 234 240 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n"; 235 241 236 if ($ encodingeq "ascii") {242 if ($self->{'input_encoding'} eq "ascii") { 237 243 undef $/; 238 244 $$textref = <FILE>; … … 241 247 my $reader = new multiread(); 242 248 $reader->set_handle ('BasPlug::FILE'); 243 $reader->set_encoding ($ encoding);249 $reader->set_encoding ($self->{'input_encoding'}); 244 250 $reader->read_file ($textref); 245 251 246 if ($ encodingeq "gb") {252 if ($self->{'input_encoding'} eq "gb") { 247 253 # segment the Chinese words 248 254 $$textref = &cnseg::segment($$textref);
Note: See TracChangeset for help on using the changeset viewer.