Changeset 1317


Ignore:
Timestamp:
2000-08-01T16:41:47+12:00 (24 years ago)
Author:
paynter
Message:

Added -extract_language option, which uses the textcat language
identification package to identify the language a document is written
in and add this information to the "language" metadata.
Also, mane ascii the default encoding (instead of Latin1).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1244 r1317  
    3030use cnseg;
    3131use acronym;
     32use textcat;
    3233use strict;
    3334use doc;
     
    3940    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
    4041    print STDERR "                     converted from these encodings and stored internally as\n";
    41     print STDERR "                     utf8. The default input_encoding is Latin1. Accepted values\n";
     42    print STDERR "                     utf8. The default input_encoding is ascii. Accepted values\n";
    4243    print STDERR "                     are:\n";
    4344    print STDERR "                        iso_8859_1 (extended ascii)\n";
     
    6566    print STDERR "                     file extensions.\n";
    6667    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
     68    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
    6769}
    6870
     
    8385    # general options available to all plugins
    8486    if (!parsargv::parse(\@_,
    85              qq^input_encoding/$encodings/Latin1^, \$self->{'input_encoding'},
     87             qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
    8688             q^process_exp/.*/^, \$self->{'process_exp'},
    8789             q^block_exp/.*/^, \$self->{'block_exp'},
    8890             q^extract_acronyms^, \$self->{'extract_acronyms'},
     91             q^extract_language^, \$self->{'extract_language'},
    8992             "allow_extra_options")) {
    9093
     
    292295    }
    293296    }
     297
     298    if ($self->{'extract_language'}) {
     299    my $thissection = $doc_obj->get_top_section();
     300    while (defined $thissection) {
     301        my $text = $doc_obj->get_text($thissection);
     302        $self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
     303        $thissection = $doc_obj->get_next_section ($thissection);
     304    }
     305    }
     306
     307}
     308
     309
     310# Identify the language of a section and add it to the metadata
     311sub extract_language {
     312    my $self = shift (@_);
     313    my ($textref, $doc_obj, $thissection) = @_;
     314
     315    # remove all HTML tags
     316    my $text = $$textref;
     317    $text =~ s/<P[^>]*>/\n/sgi;
     318    $text =~ s/<H[^>]*>/\n/sgi;
     319    $text =~ s/<[^>]*>//sgi;
     320    $text =~ tr/\n/\n/s;
     321
     322    # get the language
     323    my @results = textcat::classify($text);
     324    @results = ("unknown") if ($#results > 2);
     325
     326    my $language = join(" or ", @results);
     327    $doc_obj->add_utf8_metadata($thissection, "Language",  $language);
     328    print "Language: $language\n";
     329
    294330}
    295331
Note: See TracChangeset for help on using the changeset viewer.