Ignore:
Timestamp:
2001-01-19T10:35:13+13:00 (23 years ago)
Author:
sjboddie
Message:

Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is
now the default instead of 'ascii'). When -input_encoding is 'auto' textcat
is used to work out the language and encoding of each document prior to
processing it. This allows for documents within the same collection to be
in different encodings and all be imported correctly (as long as they're
in an encoding that's supported - notable exceptions at the moment are
Big5 Chinese and any kind of Japanese).
Doing things this way means each document is read in twice at import time,
no doubt slowing things down considerably. You can therefore still set
-input_encoding explicitly if you know that all your documents are a
particular encoding.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1838 r1844  
    3434use diagnostics;
    3535use DateExtract;
     36use iso639;
     37
     38# if textcat returns an encoding that isn't in this list
     39# we'll print a warning and use the default encoding instead
     40%supported_encodings = (
     41            "ascii" => "",
     42            "iso_8859_1" => "",
     43            "windows_1252" => "",
     44            "iso_8859_2" => "",
     45            "windows_1250" => "",
     46            "iso_8859_3" => "",
     47            "iso_8859_4" => "",
     48            "iso_8859_5" => "",
     49            "windows_1251" => "",
     50            "koi8_r" => "",
     51            "koi8_u" => "",
     52            "iso_8859_6" => "",
     53            "windows_1256" => "",
     54            "iso_8859_7" => "",
     55            "windows_1253" => "",
     56            "iso_8859_8" => "",
     57            "windows_1255" => "",
     58            "iso_8859_9" => "",
     59            "windows_1254" => "",
     60            "gb" => ""
     61            );
    3662
    3763sub print_general_usage {
     
    3965
    4066    print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
    41     print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
    42     print STDERR "                     converted from these encodings and stored internally as\n";
    43     print STDERR "                     utf8. The default input_encoding is ascii. Accepted values\n";
    44     print STDERR "                     are:\n";
    45     print STDERR "                        iso_8859_1 (extended ascii)\n";
    46     print STDERR "                        Latin1 (the same as iso-8859-1)\n";
    47     print STDERR "                        ascii (7 bit ascii -- may be faster than Latin1 as no\n";
    48     print STDERR "                               conversion is neccessary)\n";
    49     print STDERR "                        gb (GB or GBK simplified Chinese)\n";
    50     print STDERR "                        iso_8859_6 (8 bit Arabic)\n";
    51     print STDERR "                        windows_1256 (Windows codepage 1256 (Arabic))\n";
    52     print STDERR "                        Arabic (the same as windows_1256)\n";
    53     print STDERR "                        utf8 (either utf8 or unicode -- automatically detected)\n";
    54     print STDERR "                        unicode (just unicode -- doesn't currently do endian\n";
    55     print STDERR "                                 detection)\n";
    56     print STDERR "                        windows_1251 (Windows codepage 1251 (Cyrillic))\n";
     67
    5768    print STDERR "   -process_exp      A perl regular expression to match against filenames.\n";
    5869    print STDERR "                     Matching filenames will be processed by this plugin.\n";
    5970    print STDERR "                     Each plugin has its own default process_exp. e.g HTMLPlug\n";
    6071    print STDERR "                     defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
    61     print STDERR "                     .htm or .html (case-insensitive).\n";
     72    print STDERR "                     .htm or .html (case-insensitive).\n\n";
     73
    6274    print STDERR "   -block_exp        Files matching this regular expression will be blocked from\n";
    6375    print STDERR "                     being passed to any further plugins in the list. This has no\n";
     
    6678    print STDERR "                     not have a default block_exp. e.g. by default HTMLPlug blocks\n";
    6779    print STDERR "                     any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n";
    68     print STDERR "                     file extensions.\n";
     80    print STDERR "                     file extensions.\n\n";
     81
     82
     83    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
     84    print STDERR "                     converted from these encodings and stored internally as\n";
     85    print STDERR "                     utf8. The default input_encoding is 'auto'. Accepted values\n";
     86    print STDERR "                     are:\n";
     87
     88    print STDERR "                       auto: Use text categorization algorithm to automatically\n";
     89    print STDERR "                         identify the encoding of each source document. This\n";
     90    print STDERR "                         will be slower than explicitly setting the encoding\n";
     91    print STDERR "                         but will work where more than one encoding is used\n";
     92    print STDERR "                         within the same collection.\n";
     93
     94    print STDERR "                       ascii: Plain 7 bit ascii. This may be a little faster than\n";
     95    print STDERR "                         using iso_8859_1. Beware of using 'ascii' on a collection\n";
     96    print STDERR "                         of documents that may contain characters outside of plain\n";
     97    print STDERR "                         7 bit ascii though (e.g. German or French documents\n";
     98    print STDERR "                         containing accents), use iso_8859_1 instead.\n";
     99
     100    print STDERR "                       utf8: either utf8 or unicode -- automatically detected\n";
     101    print STDERR "                       unicode: just unicode\n";
     102
     103    print STDERR "                       iso_8859_1: Latin1 (western european languages)\n";
     104    print STDERR "                       windows_1252: Windows codepage 1252 (WinLatin1)\n";
     105
     106    print STDERR "                       iso_8859_2: Latin2 (central and eastern european languages)\n";
     107    print STDERR "                       windows_1250: Windows codepage 1250 (WinLatin2)\n";
     108
     109    print STDERR "                       iso_8859_3: Latin3\n";
     110
     111    print STDERR "                       iso_8859_4: Latin4\n";
     112
     113    print STDERR "                       iso_8859_5: Cyrillic\n";
     114    print STDERR "                       windows_1251: Windows codepage 1251 (WinCyrillic)\n";
     115    print STDERR "                       koi8_r: Cyrillic - Russian\n";
     116    print STDERR "                       koi8_u: Cyrillic - Ukrainian\n";
     117
     118    print STDERR "                       iso_8859_6: Arabic\n";
     119    print STDERR "                       windows_1256: Windows codepage 1256 (WinArabic)\n";
     120
     121    print STDERR "                       iso_8859_7: Greek\n";
     122    print STDERR "                       windows_1253: Windows codepage 1253 (WinGreek)\n";
     123
     124    print STDERR "                       iso_8859_8: Hebrew\n";
     125    print STDERR "                       windows_1255: Windows codepage 1255 (WinHebrew)\n";
     126
     127    print STDERR "                       iso_8859_9: Latin5\n";
     128    print STDERR "                       windows_1254: Windows codepage 1254 (WinTurkish)\n";
     129
     130    print STDERR "                       gb: GB or GBK simplified Chinese\n\n";
     131
     132    print STDERR "   -default_encoding If -input_encoding is set to 'auto' and the text categorization\n";
     133    print STDERR "                     algorithm fails to extract the encoding or extracts an encoding\n";
     134    print STDERR "                     that is not supported by Greenstone, this encoding will be used\n";
     135    print STDERR "                     instead. The default is iso_8859_1\n\n";
     136
     137    print STDERR "   -extract_language Identify the language of each document and set 'Language' metadata. Note\n";
     138    print STDERR "                     that this will be done automatically if -input_encoding is 'auto'.\n";
     139    print STDERR "   -default_language If Greenstone fails to work out what language a document is the\n";
     140    print STDERR "                     'Language' metadata element will be set to this value. The default\n";
     141    print STDERR "                     is 'en' (ISO 639 language symbols should be used - en = English).\n";
     142    print STDERR "                     Note that if -input_encoding is not set to 'auto' and -extract_language\n";
     143    print STDERR "                     is not set, all documents will have their 'Language' metadata set to\n";
     144    print STDERR "                     this value.\n\n";
     145
    69146    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
    70     print STDERR "   -markup_acronyms  Added acronym metadata into document text\n\n";
    71     print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
    72     print STDERR "   -first            Comma seperated list of first sizes to extract from the text \n";
    73     print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n";
     147
     148    print STDERR "   -markup_acronyms  Add acronym metadata into document text\n\n";
     149
     150    print STDERR "   -first            Comma seperated list of first sizes to extract from the text\n";
     151    print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n\n";
     152
    74153    print STDERR "   -extract_email    Extract email addresses as metadata\n\n";
     154
    75155    print STDERR "   -extract_date     Extract dates pertaining to the content of documents about history\n\n";
    76156}
     
    86166    my $class = shift (@_);
    87167    my $plugin_name = shift (@_);
    88 
    89168    my $self = {};
    90     my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode|windows_1251)\$";
     169   
     170    my $enc = "^(";
     171    map {$enc .= "|$_";} keys %supported_encodings;
     172    my $denc = $enc . "|utf8|unicode)\$";
     173    $enc .= "|utf8|unicode|auto)\$";
     174   
    91175    $self->{'outhandle'} = STDERR;
    92176    my $year = (localtime)[5]+1900;
     
    94178    # general options available to all plugins
    95179    if (!parsargv::parse(\@_,
    96              qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
    97180             q^process_exp/.*/^, \$self->{'process_exp'},
    98181             q^block_exp/.*/^, \$self->{'block_exp'},
     182             qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
     183             qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
    99184             q^extract_acronyms^, \$self->{'extract_acronyms'},
    100185             q^extract_email^, \$self->{'extract_email'},
    101186             q^markup_acronyms^, \$self->{'markup_acronyms'},
    102187             q^extract_language^, \$self->{'extract_language'},
     188             q^default_language/.{2}/en^, \$self->{'default_language'},
    103189             q^first/.*/^, \$self->{'first'},
    104190             q^extract_date^, \$self->{'date_extract'},
    105              "maximum_date/\\d{4}/$year", \$self->{'max_year'},
     191             qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'},
    106192             q^no_bibliography^, \$self->{'no_biblio'},
    107              "maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1",
    108              \$self->{'max_century'},
     193             qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
    109194             "allow_extra_options")) {
    110195   
     
    145230    $self->{'block_exp'} = $self->get_default_block_exp ();
    146231    }
    147    
    148     # handle input_encoding aliases
    149     $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
    150     $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
    151232}
    152233
     
    204285    }
    205286
     287    my $outhandle = $self->{'outhandle'};
     288
    206289    my $filename = &util::filename_cat($base_dir, $file);
    207290    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
     
    211294    my $plugin_name = ref ($self);
    212295    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    213    
     296
     297    my ($language, $encoding);
     298    if ($self->{'input_encoding'} eq "auto") {
     299    # use textcat to automatically work out the input encoding and language
     300    ($language, $encoding) = $self->get_language_encoding ($filename);
     301
     302    } elsif ($self->{'extract_language'}) {
     303    # use textcat to get language metadata
     304    ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
     305    $encoding = $self->{'input_encoding'};
     306
     307    if ($extracted_encoding != $encoding && $self->{'verbosity'}) {
     308        print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
     309        print $outhandle "appears to be encoded as $extracted_encoding.";
     310    }
     311
     312    } else {
     313    $language = $self->{'default_language'};
     314    $encoding = $self->{'input_encoding'};
     315    }
     316
    214317    # create a new document
    215318    my $doc_obj = new doc ($filename, "indexed_doc");
     319    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
     320    $doc_obj->set_source_encoding ($encoding);
     321   
    216322   
    217323    # read in file ($text will be in utf8)
    218324    my $text = "";
    219     $self->read_file ($filename, \$text);
    220 
    221     if ($text !~ /\w/) {
    222     my $outhandle = $self->{'outhandle'};
     325    $self->read_file ($filename, $encoding, \$text);
     326
     327    if (!length ($text)) {
    223328    print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
    224329    return 0;
     
    260365sub read_file {
    261366    my $self = shift (@_);
    262     my ($filename, $textref) = @_;
     367    my ($filename, $encoding, $textref) = @_;
    263368
    264369    if (!-r $filename)
    265370    {
    266     print STDERR "Read permission denied for $filename\n";
     371    my $outhandle = $self->{'outhandle'};
     372    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
    267373    return;
    268374    }
     
    272378    open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
    273379
    274     if ($self->{'input_encoding'} eq "ascii") {
     380    if ($encoding eq "ascii") {
    275381    undef $/;
    276382    $$textref = <FILE>;
     
    279385    my $reader = new multiread();
    280386    $reader->set_handle ('BasPlug::FILE');
    281     $reader->set_encoding ($self->{'input_encoding'});
     387    $reader->set_encoding ($encoding);
    282388    $reader->read_file ($textref);
    283389
    284     if ($self->{'input_encoding'} eq "gb") {
     390    if ($encoding eq "gb") {
    285391        # segment the Chinese words
    286392        $$textref = &cnseg::segment($$textref);
     
    289395
    290396    close FILE;
     397}
     398
     399# Uses textcat to work out the encoding and language of the text in
     400# $filename. All html tags are removed before processing.
     401# returns an array containing "language" and "encoding"
     402sub get_language_encoding {
     403    my $self = shift (@_);
     404    my ($filename) = @_;
     405    my $outhandle = $self->{'outhandle'};
     406
     407    # read in file
     408    open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
     409    undef $/;
     410    my $text = <FILE>;
     411    $/ = "\n";
     412    close FILE;
     413
     414    # remove all HTML tags
     415    $text =~ s/<[^>]*>//sg;
     416
     417    # get the language/encoding
     418    my @results = textcat::classify($text);
     419
     420#    foreach $i (@results) {
     421#   print STDERR "i: $i\n";
     422#    }
     423
     424    if (scalar @results != 1) {
     425    if ($self->{'input_encoding'} ne 'auto') {
     426        if ($self->{'extract_language'} && $self->{'verbosity'}) {
     427        print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
     428        print $outhandle "defaulting to $self->{'default_language'}\n";
     429        }       
     430        return ($self->{'default_language'}, $self->{'input_encoding'});
     431
     432    } else {
     433        if ($self->{'verbosity'}) {
     434        print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
     435        print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
     436        }
     437        return ($self->{'default_language'}, $self->{'default_encoding'});
     438    }
     439    }
     440
     441    # format language/encoding
     442    my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
     443    $language = $iso639::toiso639{lc($language)};
     444    die "Invalid language\n" if !defined $language;
     445
     446    if (!defined $encoding) {
     447    # if textcat returned no encoding info it is assumed to be iso_8859_1
     448    $encoding = "iso_8859_1";
     449    } else {
     450    # convert to the format we expect
     451    $encoding =~ s/windows/windows_/;
     452    $encoding =~ s/iso8859/iso_8859/;
     453    $encoding =~ s/^gb.*$/gb/;
     454    }
     455
     456    if (!defined $supported_encodings{$encoding}) {
     457    if ($self->{'verbosity'}) {
     458        print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
     459        print $outhandle "using $self->{'default_encoding'}\n";
     460    }
     461    $encoding = $self->{'default_encoding'};
     462    }
     463
     464    return ($language, $encoding);
    291465}
    292466
     
    351525
    352526    print $outhandle " extracting email addresses ...\n"
    353     if ($self->{'verbosity'} >= 2);
     527    if ($self->{'verbosity'} > 2);
    354528   
    355529    my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g);
     
    362536        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
    363537        print $outhandle "  extracting $address\n"
    364         if ($self->{'verbosity'} >= 3);
     538        if ($self->{'verbosity'} > 3);
    365539    }
    366540    }
    367541    print $outhandle " done extracting email addresses.\n"
    368     if ($self->{'verbosity'} >= 2);
     542    if ($self->{'verbosity'} > 2);
    369543
    370544}
     
    437611}
    438612
    439 
    440 # Identify the language of a section and add it to the metadata
    441 sub extract_language {
    442     my $self = shift (@_);
    443     my ($textref, $doc_obj, $thissection) = @_;
    444 
    445     # remove all HTML tags
    446     my $text = $$textref;
    447     $text =~ s/<P[^>]*>/\n/sgi;
    448     $text =~ s/<H[^>]*>/\n/sgi;
    449     $text =~ s/<[^>]*>//sgi;
    450     $text =~ tr/\n/\n/s;
    451 
    452     # get the language
    453     my @results = textcat::classify($text);
    454     @results = ("unknown") if ($#results > 2);
    455 
    456     # create language string and remove encoding information
    457     my $language = join(" or ", @results);
    458     $language =~ s/\-\w+//g;
    459     $doc_obj->add_utf8_metadata($thissection, "Language",  $language);
    460     # print "Language: ", time, "-> $language\n";
    461 
    462 }
    463 
    464613# extract acronyms from a section in a document. progress is
    465614# reported to outhandle based on the verbosity. both the Acronym
     
    472621
    473622    print $outhandle " extracting acronyms ...\n"
    474     if ($self->{'verbosity'} >= 2);
     623    if ($self->{'verbosity'} > 2);
    475624
    476625    my $acro_array =  &acronym::acronyms($textref);
     
    496645        $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
    497646        print $outhandle "  adding ". $acro->to_string() . "\n"
    498         if ($self->{'verbosity'} >= 3);
     647        if ($self->{'verbosity'} > 3);
    499648       
    500649    }
    501650    }
    502651    print $outhandle " done extracting acronyms. \n"
    503     if ($self->{'verbosity'} >= 2);
     652    if ($self->{'verbosity'} > 2);
    504653}
    505654
     
    510659
    511660    print $outhandle " marking up acronyms ...\n"
    512     if ($self->{'verbosity'} >= 2);
     661    if ($self->{'verbosity'} > 2);
    513662
    514663    #self is passed in to check for verbosity ...
     
    516665
    517666    print $outhandle " done marking up acronyms. \n"
    518     if ($self->{'verbosity'} >= 2);
     667    if ($self->{'verbosity'} > 2);
    519668
    520669    return $text;
     
    522671
    5236721;
    524 
    525 
    526 
Note: See TracChangeset for help on using the changeset viewer.