Changeset 1317

Show
Ignore:
Timestamp:
01.08.2000 16:41:47 (19 years ago)
Author:
paynter
Message:

Added -extract_language option, which uses the textcat language
identification package to identify the language a document is written
in and add this information to the "language" metadata.
Also, mane ascii the default encoding (instead of Latin1).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1244 r1317  
    3030use cnseg; 
    3131use acronym; 
     32use textcat; 
    3233use strict; 
    3334use doc; 
     
    3940    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n"; 
    4041    print STDERR "                     converted from these encodings and stored internally as\n"; 
    41     print STDERR "                     utf8. The default input_encoding is Latin1. Accepted values\n"; 
     42    print STDERR "                     utf8. The default input_encoding is ascii. Accepted values\n"; 
    4243    print STDERR "                     are:\n"; 
    4344    print STDERR "                        iso_8859_1 (extended ascii)\n"; 
     
    6566    print STDERR "                     file extensions.\n"; 
    6667    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n"; 
     68    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n"; 
    6769} 
    6870 
     
    8385    # general options available to all plugins 
    8486    if (!parsargv::parse(\@_, 
    85              qq^input_encoding/$encodings/Latin1^, \$self->{'input_encoding'}, 
     87             qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'}, 
    8688             q^process_exp/.*/^, \$self->{'process_exp'}, 
    8789             q^block_exp/.*/^, \$self->{'block_exp'}, 
    8890             q^extract_acronyms^, \$self->{'extract_acronyms'}, 
     91             q^extract_language^, \$self->{'extract_language'}, 
    8992             "allow_extra_options")) { 
    9093 
     
    292295    } 
    293296    } 
     297 
     298    if ($self->{'extract_language'}) { 
     299    my $thissection = $doc_obj->get_top_section(); 
     300    while (defined $thissection) { 
     301        my $text = $doc_obj->get_text($thissection); 
     302        $self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./; 
     303        $thissection = $doc_obj->get_next_section ($thissection); 
     304    } 
     305    } 
     306 
     307} 
     308 
     309 
     310# Identify the language of a section and add it to the metadata 
     311sub extract_language { 
     312    my $self = shift (@_); 
     313    my ($textref, $doc_obj, $thissection) = @_; 
     314 
     315    # remove all HTML tags 
     316    my $text = $$textref; 
     317    $text =~ s/<P[^>]*>/\n/sgi; 
     318    $text =~ s/<H[^>]*>/\n/sgi; 
     319    $text =~ s/<[^>]*>//sgi; 
     320    $text =~ tr/\n/\n/s; 
     321 
     322    # get the language 
     323    my @results = textcat::classify($text); 
     324    @results = ("unknown") if ($#results > 2); 
     325 
     326    my $language = join(" or ", @results); 
     327    $doc_obj->add_utf8_metadata($thissection, "Language",  $language); 
     328    print "Language: $language\n"; 
     329 
    294330} 
    295331