Changeset 2027
- Timestamp:
- 2001-02-20T15:57:37+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r1974 r2027
  215   215
  216   216  # Override BasPlug read
  217   217  # We don't want to get language encoding stuff until after we've converted
        218  # our file to either TEXT or HTML.
  218   219  sub read {
  219   220      my $self = shift (@_);
  220
  221            my $ret_val = BasPlug::read($self,@_);
  222
        221      my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
        222      # if ($self->is_recursive()) {
        223      # die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
        224      # }
        225
        226      my $outhandle = $self->{'outhandle'};
        227
        228      my $filename = &util::filename_cat($base_dir, $file);
        229      return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
        230      if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        231          return undef;
        232      }
        233      my $plugin_name = ref ($self);
        234      $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
        235
        236
        237      # read in file ($text will be in utf8)
        238      my $text = "";
        239      # $self->read_file ($filename, $encoding, \$text);
        240      ### was read_file
        241      # my $self = shift (@_);
        242      # my ($src_filename, $encoding, $textref) = @_;
        243
        244      my $output_ext = $self->{'convert_to_ext'};
        245      my $conv_filename = $self->tmp_area_convert_file($output_ext,$filename);
        246      # change following to return undef?
        247      if ("$conv_filename" eq "") {return "";} # allows continue on errors
        248      $self->{'conv_filename'} = $conv_filename;
        249
        250      ### was read_file
        251
        252
        253
        254
        255      # Do encoding stuff
        256      my ($language, $encoding);
        257      if ($self->{'input_encoding'} eq "auto") {
        258          # use textcat to automatically work out the input encoding and language
        259          ($language, $encoding) = $self->get_language_encoding ($conv_filename);
        260      } elsif ($self->{'extract_language'}) {
        261          # use textcat to get language metadata
        262
        263          my ($language, $extracted_encoding) = $self->get_language_encoding ($conv_filename);
        264          $encoding = $self->{'input_encoding'};
        265          if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
        266              print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
        267              print $outhandle "appears to be encoded as $extracted_encoding.\n";
        268          }
        269      } else {
        270          $language = $self->{'default_language'};
        271          $encoding = $self->{'input_encoding'};
        272      }
        273
        274      BasPlug::read_file($self,$conv_filename, $encoding, \$text);
        275      if (!length ($text)) {
        276          print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
        277          return 0;
        278      }
        279
        280      # create a new document
        281      my $doc_obj = new doc ($conv_filename, "indexed_doc");
        282
        283      $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language",
        284                                  $language);
        285      $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding",
        286                                  $encoding);
        287
        288
        289      # include any metadata passed in from previous plugins
        290      # note that this metadata is associated with the top level section
        291      $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
        292      # do plugin specific processing of doc_obj
        293      return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
        294      # do any automatic metadata extraction
        295      $self->auto_extract_metadata ($doc_obj);
        296      # add an OID
        297      $doc_obj->set_OID();
        298      # process the document
        299      $processor->process($doc_obj);
  223   300      $self->cleanup_tmp_area();
  224
  225            return $ret_val;
        301
        302
        303      return 1;
  226   304  }
  227   305
Note:
See TracChangeset
for help on using the changeset viewer.