Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1317

Timestamp:

2000-08-01T16:41:47+12:00 (24 years ago)

Author:

paynter

Message:

Added -extract_language option, which uses the textcat language
identification package to identify the language a document is written
in and add this information to the "language" metadata.
Also, make ascii the default encoding (instead of Latin1).

File:

: 1 edited

trunk/gsdl/perllib/plugins/BasPlug.pm (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r1244
+              r1317
 use cnseg;
 use acronym;
+use textcat;
 use strict;
 use doc;
 …
     print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
     print STDERR "                     converted from these encodings and stored internally as\n";
     print STDERR "                     utf8. The default input_encoding is Latin1. Accepted values\n";
+    print STDERR "                     utf8. The default input_encoding is ascii. Accepted values\n";
     print STDERR "                     are:\n";
     print STDERR "                        iso_8859_1 (extended ascii)\n";
 …
     print STDERR "                     file extensions.\n";
     print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
+    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
+}
 …
     # general options available to all plugins
     if (!parsargv::parse(\@_,
              qq^input_encoding/$encodings/Latin1^, \$self->{'input_encoding'},
+             qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
              q^process_exp/.*/^, \$self->{'process_exp'},
              q^block_exp/.*/^, \$self->{'block_exp'},
              q^extract_acronyms^, \$self->{'extract_acronyms'},
+             q^extract_language^, \$self->{'extract_language'},
              "allow_extra_options")) {
 …
+    }
+    }
+    if ($self->{'extract_language'}) {
+    my $thissection = $doc_obj->get_top_section();
+    while (defined $thissection) {
+        my $text = $doc_obj->get_text($thissection);
+        $self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
+        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
+}
+# Identify the language of a section and add it to the metadata.
+#
+# Arguments:
+#   $textref     - reference to the section text; the referenced string is
+#                  not modified, a private copy is cleaned instead
+#   $doc_obj     - document object that receives the metadata
+#   $thissection - the section of $doc_obj the text belongs to
+#
+# The tag-stripped text is handed to textcat::classify and the candidates
+# it returns are joined with " or " and stored as "Language" metadata.
+# When there are no candidates at all, or more than three, the value
+# "unknown" is stored instead.
+sub extract_language {
+    my $self = shift (@_);
+    my ($textref, $doc_obj, $thissection) = @_;
+    # remove all HTML tags; tags whose name starts with P or H are turned
+    # into line breaks first so the text they separate does not run together
+    my $text = $$textref;
+    $text =~ s/<P[^>]*>/\n/sgi;
+    $text =~ s/<H[^>]*>/\n/sgi;
+    $text =~ s/<[^>]*>//sgi;
+    # squeeze runs of newlines down to a single newline
+    $text =~ tr/\n/\n/s;
+    # get the language
+    my @results = textcat::classify($text);
+    # an empty candidate list would otherwise be stored as an empty string;
+    # treat it the same way as an over-long (ambiguous) candidate list
+    @results = ("unknown") if (!@results || $#results > 2);
+    my $language = join(" or ", @results);
+    $doc_obj->add_utf8_metadata($thissection, "Language",  $language);
+    # NOTE(review): this goes to STDOUT, unlike the usage text which is
+    # printed to STDERR - confirm that is intended
+    print "Language: $language\n";
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1317

Legend:

trunk/gsdl/perllib/plugins/BasPlug.pm

Download in other formats: