Context Navigation

← Previous Change
Next Change →

BasPlug.pm

Timestamp:

2001-01-19T10:35:13+13:00 (23 years ago)

Author:

sjboddie

Message:

Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is
now the default instead of 'ascii'). When -input_encoding is 'auto' textcat
is used to work out the language and encoding of each document prior to
processing it. This allows for documents within the same collection to be
in different encodings and all be imported correctly (as long as they're
in an encoding that's supported - notable exceptions at the moment are
Big5 Chinese and any kind of Japanese).
Doing things this way means each document is read in twice at import time,
no doubt slowing things down considerably. You can therefore still set
-input_encoding explicitly if you know that all your documents are a
particular encoding.

File:

: 1 edited

trunk/gsdl/perllib/plugins/BasPlug.pm (modified) (20 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r1838
+              r1844
 use diagnostics;
 use DateExtract;
+use iso639;
+# Encoding names accepted from textcat (only the keys are consulted; every value is an empty
+# string). If textcat returns an encoding that isn't a key here we warn and use the default.
+%supported_encodings = (
+            "ascii" => "",
+            "iso_8859_1" => "",
+            "windows_1252" => "",
+            "iso_8859_2" => "",
+            "windows_1250" => "",
+            "iso_8859_3" => "",
+            "iso_8859_4" => "",
+            "iso_8859_5" => "",
+            "windows_1251" => "",
+            "koi8_r" => "",
+            "koi8_u" => "",
+            "iso_8859_6" => "",
+            "windows_1256" => "",
+            "iso_8859_7" => "",
+            "windows_1253" => "",
+            "iso_8859_8" => "",
+            "windows_1255" => "",
+            "iso_8859_9" => "",
+            "windows_1254" => "",
+            "gb" => ""
+            );
 sub print_general_usage {
 …
     print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
+    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
+    print STDERR "                     converted from these encodings and stored internally as\n";
+    print STDERR "                     utf8. The default input_encoding is ascii. Accepted values\n";
+    print STDERR "                     are:\n";
+    print STDERR "                        iso_8859_1 (extended ascii)\n";
+    print STDERR "                        Latin1 (the same as iso-8859-1)\n";
+    print STDERR "                        ascii (7 bit ascii -- may be faster than Latin1 as no\n";
+    print STDERR "                               conversion is neccessary)\n";
+    print STDERR "                        gb (GB or GBK simplified Chinese)\n";
+    print STDERR "                        iso_8859_6 (8 bit Arabic)\n";
+    print STDERR "                        windows_1256 (Windows codepage 1256 (Arabic))\n";
+    print STDERR "                        Arabic (the same as windows_1256)\n";
+    print STDERR "                        utf8 (either utf8 or unicode -- automatically detected)\n";
+    print STDERR "                        unicode (just unicode -- doesn't currently do endian\n";
+    print STDERR "                                 detection)\n";
+    print STDERR "                        windows_1251 (Windows codepage 1251 (Cyrillic))\n";
     print STDERR "   -process_exp      A perl regular expression to match against filenames.\n";
     print STDERR "                     Matching filenames will be processed by this plugin.\n";
     print STDERR "                     Each plugin has its own default process_exp. e.g HTMLPlug\n";
     print STDERR "                     defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
+    print STDERR "                     .htm or .html (case-insensitive).\n";
+    print STDERR "                     .htm or .html (case-insensitive).\n\n";
     print STDERR "   -block_exp        Files matching this regular expression will be blocked from\n";
     print STDERR "                     being passed to any further plugins in the list. This has no\n";
 …
     print STDERR "                     not have a default block_exp. e.g. by default HTMLPlug blocks\n";
     print STDERR "                     any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n";
+    print STDERR "                     file extensions.\n";
+    print STDERR "                     file extensions.\n\n";
+    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
+    print STDERR "                     converted from these encodings and stored internally as\n";
+    print STDERR "                     utf8. The default input_encoding is 'auto'. Accepted values\n";
+    print STDERR "                     are:\n";
+    print STDERR "                       auto: Use text categorization algorithm to automatically\n";
+    print STDERR "                         identify the encoding of each source document. This\n";
+    print STDERR "                         will be slower than explicitly setting the encoding\n";
+    print STDERR "                         but will work where more than one encoding is used\n";
+    print STDERR "                         within the same collection.\n";
+    print STDERR "                       ascii: Plain 7 bit ascii. This may be a little faster than\n";
+    print STDERR "                         using iso_8859_1. Beware of using 'ascii' on a collection\n";
+    print STDERR "                         of documents that may contain characters outside of plain\n";
+    print STDERR "                         7 bit ascii though (e.g. German or French documents\n";
+    print STDERR "                         containing accents), use iso_8859_1 instead.\n";
+    print STDERR "                       utf8: either utf8 or unicode -- automatically detected\n";
+    print STDERR "                       unicode: just unicode\n";
+    print STDERR "                       iso_8859_1: Latin1 (western european languages)\n";
+    print STDERR "                       windows_1252: Windows codepage 1252 (WinLatin1)\n";
+    print STDERR "                       iso_8859_2: Latin2 (central and eastern european languages)\n";
+    print STDERR "                       windows_1250: Windows codepage 1250 (WinLatin2)\n";
+    print STDERR "                       iso_8859_3: Latin3\n";
+    print STDERR "                       iso_8859_4: Latin4\n";
+    print STDERR "                       iso_8859_5: Cyrillic\n";
+    print STDERR "                       windows_1251: Windows codepage 1251 (WinCyrillic)\n";
+    print STDERR "                       koi8_r: Cyrillic - Russian\n";
+    print STDERR "                       koi8_u: Cyrillic - Ukrainian\n";
+    print STDERR "                       iso_8859_6: Arabic\n";
+    print STDERR "                       windows_1256: Windows codepage 1256 (WinArabic)\n";
+    print STDERR "                       iso_8859_7: Greek\n";
+    print STDERR "                       windows_1253: Windows codepage 1253 (WinGreek)\n";
+    print STDERR "                       iso_8859_8: Hebrew\n";
+    print STDERR "                       windows_1255: Windows codepage 1255 (WinHebrew)\n";
+    print STDERR "                       iso_8859_9: Latin5\n";
+    print STDERR "                       windows_1254: Windows codepage 1254 (WinTurkish)\n";
+    print STDERR "                       gb: GB or GBK simplified Chinese\n\n";
+    print STDERR "   -default_encoding If -input_encoding is set to 'auto' and the text categorization\n";
+    print STDERR "                     algorithm fails to extract the encoding or extracts an encoding\n";
+    print STDERR "                     that is not supported by Greenstone, this encoding will be used\n";
+    print STDERR "                     instead. The default is iso_8859_1\n\n";
+    print STDERR "   -extract_language Identify the language of each document and set 'Language' metadata. Note\n";
+    print STDERR "                     that this will be done automatically if -input_encoding is 'auto'.\n";
+    print STDERR "   -default_language If Greenstone fails to work out what language a document is the\n";
+    print STDERR "                     'Language' metadata element will be set to this value. The default\n";
+    print STDERR "                     is 'en' (ISO 639 language symbols should be used - en = English).\n";
+    print STDERR "                     Note that if -input_encoding is not set to 'auto' and -extract_language\n";
+    print STDERR "                     is not set, all documents will have their 'Language' metadata set to\n";
+    print STDERR "                     this value.\n\n";
     print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
+    print STDERR "   -markup_acronyms  Added acronym metadata into document text\n\n";
+    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
+    print STDERR "   -first            Comma seperated list of first sizes to extract from the text \n";
+    print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n";
+    print STDERR "   -markup_acronyms  Add acronym metadata into document text\n\n";
+    print STDERR "   -first            Comma seperated list of first sizes to extract from the text\n";
+    print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n\n";
     print STDERR "   -extract_email    Extract email addresses as metadata\n\n";
     print STDERR "   -extract_date     Extract dates pertaining to the content of documents about history\n\n";
+}
 …
     my $class = shift (@_);
     my $plugin_name = shift (@_);
     my $self = {};
+    my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode|windows_1251)\$";
+    my $enc = "^(";
+    map {$enc .= "|$_";} keys %supported_encodings;
+    my $denc = $enc . "|utf8|unicode)\$";
+    $enc .= "|utf8|unicode|auto)\$";
     $self->{'outhandle'} = STDERR;
     my $year = (localtime)[5]+1900;
 …
     # general options available to all plugins
     if (!parsargv::parse(\@_,
-             qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
              q^process_exp/.*/^, \$self->{'process_exp'},
              q^block_exp/.*/^, \$self->{'block_exp'},
+             qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
+             qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
              q^extract_acronyms^, \$self->{'extract_acronyms'},
              q^extract_email^, \$self->{'extract_email'},
              q^markup_acronyms^, \$self->{'markup_acronyms'},
              q^extract_language^, \$self->{'extract_language'},
+             q^default_language/.{2}/en^, \$self->{'default_language'},
              q^first/.*/^, \$self->{'first'},
              q^extract_date^, \$self->{'date_extract'},
              "maximum_date/\\d{4}/$year", \$self->{'max_year'},
+             qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'},
              q^no_bibliography^, \$self->{'no_biblio'},
+             "maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1",
+             \$self->{'max_century'},
+             qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
              "allow_extra_options")) {
 …
     $self->{'block_exp'} = $self->get_default_block_exp ();
+    }
-    # handle input_encoding aliases
-    $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
-    $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
+}
 …
+    }
+    my $outhandle = $self->{'outhandle'};
     my $filename = &util::filename_cat($base_dir, $file);
     return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
 …
     my $plugin_name = ref ($self);
     $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
+    my ($language, $encoding);
+    if ($self->{'input_encoding'} eq "auto") {
+    # use textcat to automatically work out the input encoding and language
+    ($language, $encoding) = $self->get_language_encoding ($filename);
+    } elsif ($self->{'extract_language'}) {
+    # use textcat to get language metadata
+    ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
+    $encoding = $self->{'input_encoding'};
+    if ($extracted_encoding != $encoding && $self->{'verbosity'}) {
+        print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
+        print $outhandle "appears to be encoded as $extracted_encoding.";
+    }
+    } else {
+    $language = $self->{'default_language'};
+    $encoding = $self->{'input_encoding'};
+    }
     # create a new document
     my $doc_obj = new doc ($filename, "indexed_doc");
+    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
+    $doc_obj->set_source_encoding ($encoding);
     # read in file ($text will be in utf8)
     my $text = "";
+    $self->read_file ($filename, \$text);
+    if ($text !~ /\w/) {
+    my $outhandle = $self->{'outhandle'};
+    $self->read_file ($filename, $encoding, \$text);
+    if (!length ($text)) {
     print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
     return 0;
 …
 sub read_file {
     my $self = shift (@_);
     my ($filename, $textref) = @_;
+    my ($filename, $encoding, $textref) = @_;
     if (!-r $filename)
+    {
+    print STDERR "Read permission denied for $filename\n";
+    my $outhandle = $self->{'outhandle'};
+    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
     return;
+    }
 …
     open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
     if ($self->{'input_encoding'} eq "ascii") {
+    if ($encoding eq "ascii") {
     undef $/;
     $$textref = <FILE>;
 …
     my $reader = new multiread();
     $reader->set_handle ('BasPlug::FILE');
     $reader->set_encoding ($self->{'input_encoding'});
+    $reader->set_encoding ($encoding);
     $reader->read_file ($textref);
     if ($self->{'input_encoding'} eq "gb") {
+    if ($encoding eq "gb") {
         # segment the Chinese words
         $$textref = &cnseg::segment($$textref);
 …
     close FILE;
+}
+# Uses textcat to work out the language and encoding of the text in
+# $filename. All HTML tags are removed before classification.
+#
+# Returns a two element list: (language, encoding).
+#  - language is the value %iso639::toiso639 holds for the lower-cased
+#    language name reported by textcat, or -default_language if there is
+#    no such entry.
+#  - encoding is a key of %supported_encodings, or -default_encoding if
+#    textcat reported something that is not in that hash.
+# When textcat does not give exactly one result, -default_language is
+# returned together with -default_encoding (-input_encoding 'auto') or
+# with the explicitly set -input_encoding (any other setting).
+#
+# Dies if $filename cannot be opened.
+sub get_language_encoding {
+    my $self = shift (@_);
+    my ($filename) = @_;
+    my $outhandle = $self->{'outhandle'};
+
+    # read in the whole file. A lexical handle and a localised $/ are used
+    # so that neither the package-wide FILE handle nor the caller's input
+    # record separator are disturbed by this routine
+    open (my $fh, '<', $filename)
+        or die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
+    my $text = do { local $/; <$fh> };
+    close ($fh);
+    $text = "" unless defined $text;
+
+    # remove all HTML tags
+    $text =~ s/<[^>]*>//sg;
+
+    # get the language/encoding
+    my @results = textcat::classify($text);
+
+    # anything other than exactly one answer is treated as a failure
+    if (scalar @results != 1) {
+        if ($self->{'input_encoding'} ne 'auto') {
+            # only the language was wanted; keep the encoding the user set
+            if ($self->{'extract_language'} && $self->{'verbosity'}) {
+                print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
+                print $outhandle "defaulting to $self->{'default_language'}\n";
+            }
+            return ($self->{'default_language'}, $self->{'input_encoding'});
+        }
+
+        if ($self->{'verbosity'}) {
+            print $outhandle "BasPlug: WARNING: language/encoding could not be extracted from $filename - ";
+            print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
+        }
+        return ($self->{'default_language'}, $self->{'default_encoding'});
+    }
+
+    # the result is split at the first '-' into a language name and an
+    # optional encoding name
+    my ($textcat_language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
+    $textcat_language = "" unless defined $textcat_language;
+
+    my $language = $iso639::toiso639{lc($textcat_language)};
+    if (!defined $language) {
+        # a language missing from the iso639 table should not abort the
+        # whole import - fall back to the default language instead
+        if ($self->{'verbosity'}) {
+            print $outhandle "BasPlug: WARNING: $filename appears to be in an unrecognised language ($textcat_language) - ";
+            print $outhandle "defaulting to $self->{'default_language'}\n";
+        }
+        $language = $self->{'default_language'};
+    }
+
+    if (!defined $encoding) {
+        # if textcat returned no encoding info it is assumed to be iso_8859_1
+        $encoding = "iso_8859_1";
+    } else {
+        # convert to the format we expect (the optional '_' keeps a name
+        # that is already in our format unchanged)
+        $encoding =~ s/windows_?/windows_/;
+        $encoding =~ s/iso_?8859/iso_8859/;
+        $encoding =~ s/^gb.*$/gb/;
+    }
+
+    if (!exists $supported_encodings{$encoding}) {
+        if ($self->{'verbosity'}) {
+            print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
+            print $outhandle "using $self->{'default_encoding'}\n";
+        }
+        $encoding = $self->{'default_encoding'};
+    }
+
+    return ($language, $encoding);
+}
 …
     print $outhandle " extracting email addresses ...\n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
     my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g);
 …
         $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
         print $outhandle "  extracting $address\n"
         if ($self->{'verbosity'} >= 3);
+        if ($self->{'verbosity'} > 3);
+    }
+    }
     print $outhandle " done extracting email addresses.\n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
+}
 …
+}
-# Identify the language of a section and add it to the metadata
-sub extract_language {
-    my $self = shift (@_);
-    my ($textref, $doc_obj, $thissection) = @_;
-    # remove all HTML tags
-    my $text = $$textref;
-    $text =~ s/<P[^>]*>/\n/sgi;
-    $text =~ s/<H[^>]*>/\n/sgi;
-    $text =~ s/<[^>]*>//sgi;
-    $text =~ tr/\n/\n/s;
-    # get the language
-    my @results = textcat::classify($text);
-    @results = ("unknown") if ($#results > 2);
-    # create language string and remove encoding information
-    my $language = join(" or ", @results);
-    $language =~ s/\-\w+//g;
-    $doc_obj->add_utf8_metadata($thissection, "Language",  $language);
-    # print "Language: ", time, "-> $language\n";
+}
 # extract acronyms from a section in a document. progress is
 # reported to outhandle based on the verbosity. both the Acronym
 …
     print $outhandle " extracting acronyms ...\n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
     my $acro_array =  &acronym::acronyms($textref);
 …
         $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
         print $outhandle "  adding ". $acro->to_string() . "\n"
         if ($self->{'verbosity'} >= 3);
+        if ($self->{'verbosity'} > 3);
+    }
+    }
     print $outhandle " done extracting acronyms. \n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
+}
 …
     print $outhandle " marking up acronyms ...\n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
     #self is passed in to check for verbosity ...
 …
     print $outhandle " done marking up acronyms. \n"
     if ($self->{'verbosity'} >= 2);
+    if ($self->{'verbosity'} > 2);
     return $text;
 …
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1844 for trunk/gsdl/perllib/plugins/BasPlug.pm

Legend:

trunk/gsdl/perllib/plugins/BasPlug.pm

Download in other formats: