Changeset 1317
- Timestamp:
- 2000-08-01T16:41:47+12:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r1244 r1317
   30    30   use cnseg;
   31    31   use acronym;
         32 + use textcat;
   32    33   use strict;
   33    34   use doc;
    …     …
   39    40   print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
   40    41   print STDERR " converted from these encodings and stored internally as\n";
   41       - print STDERR " utf8. The default input_encoding is Latin1. Accepted values\n";
         42 + print STDERR " utf8. The default input_encoding is ascii. Accepted values\n";
   42    43   print STDERR " are:\n";
   43    44   print STDERR " iso_8859_1 (extended ascii)\n";
    …     …
   65    66   print STDERR " file extensions.\n";
   66    67   print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
         68 + print STDERR " -extract_langauge Identify the language of the text and set as metadata\n\n";
   67    69   }
   68    70
    …     …
   83    85   # general options available to all plugins
   84    86   if (!parsargv::parse(\@_,
   85       - qq^input_encoding/$encodings/ Latin1^, \$self->{'input_encoding'},
         87 + qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
   86    88   q^process_exp/.*/^, \$self->{'process_exp'},
   87    89   q^block_exp/.*/^, \$self->{'block_exp'},
   88    90   q^extract_acronyms^, \$self->{'extract_acronyms'},
         91 + q^extract_language^, \$self->{'extract_language'},
   89    92   "allow_extra_options")) {
   90    93
    …     …
  292   295   }
  293   296   }
        297 +
        298 + if ($self->{'extract_language'}) {
        299 + my $thissection = $doc_obj->get_top_section();
        300 + while (defined $thissection) {
        301 + my $text = $doc_obj->get_text($thissection);
        302 + $self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
        303 + $thissection = $doc_obj->get_next_section ($thissection);
        304 + }
        305 + }
        306 +
        307 + }
        308 +
        309 +
        310 + # Identify the language of a section and add it to the metadata
        311 + sub extract_language {
        312 + my $self = shift (@_);
        313 + my ($textref, $doc_obj, $thissection) = @_;
        314 +
        315 + # remove all HTML tags
        316 + my $text = $$textref;
        317 + $text =~ s/<P[^>]*>/\n/sgi;
        318 + $text =~ s/<H[^>]*>/\n/sgi;
        319 + $text =~ s/<[^>]*>//sgi;
        320 + $text =~ tr/\n/\n/s;
        321 +
        322 + # get the language
        323 + my @results = textcat::classify($text);
        324 + @results = ("unknown") if ($#results > 2);
        325 +
        326 + my $language = join(" or ", @results);
        327 + $doc_obj->add_utf8_metadata($thissection, "Language", $language);
        328 + print "Language: $language\n";
        329 +
  294   330   }
  295   331
Note:
See TracChangeset
for help on using the changeset viewer.