Changeset 1844


Ignore:
Timestamp:
2001-01-19T10:35:13+13:00 (23 years ago)
Author:
sjboddie
Message:

Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is
now the default instead of 'ascii'). Wihen -input_encoding is 'auto' textcat
is used to work out the language and encoding of each document prior to
processing it. This allows for documents within the same collection to be
in different encodings and all be imported correctly (as long as they're
in an encoding that's supported - notable exceptions at the moment are
Big5 Chinese and any kind of Japanese).
Doing things this way means each document is read in twice at import time,
no doubt slowing things down considerably. You can therefore still set
-input_encoding explicitly if you know that all your documents are a
particular encoding.

Location:
trunk/gsdl/perllib
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/doc.pm

    r1732 r1844  
    130130}
    131131
     132sub set_source_encoding {
     133    my $self = shift (@_);
     134    my ($source_encoding) = @_;
     135
     136    $self->set_metadata_element ($self->get_top_section(),
     137                 "gsdlsourceencoding",
     138                 $source_encoding);
     139}
     140
     141# returns the source_encoding as it was provided
     142sub get_source_encoding {
     143    my $self = shift (@_);
     144   
     145    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourceencoding");
     146}
     147
    132148sub _escape_text {
    133149    my ($text) = @_;
  • trunk/gsdl/perllib/docsave.pm

    r1454 r1844  
    239239                     "gsdlassocfile",
    240240                     "$afile:$assoc_file->[2]:$dir");
    241     } else {
     241    } elsif ($self->{'verbosity'} > 2) {
    242242        print $outhandle "docsave::process couldn't copy the associated file " .
    243243        "$assoc_file->[0] to $afile\n";
  • trunk/gsdl/perllib/multiread.pm

    r1838 r1844  
    2626# encodings currently supported are
    2727#
    28 # utf8         - either utf8 or unicode (automatically detected)
    29 # unicode      - just unicode (doesn't currently do endian detection)
    30 # gb           - GB
    31 # iso_8859_1   - extended ascii (iso-8859-1)
    32 # iso_8859_6   - 8 bit arabic (iso-8859-6)
    33 # windows_1256 - Windows codepage 1256 (Arabic)
    34 # windows_1251 - Windows codepage 1251 (Cyrillic)
     28# utf8             - either utf8 or unicode (automatically detected)
     29# unicode          - just unicode (doesn't currently do endian detection)
     30# gb               - GB
     31# iso_8859_[1-9]   - 8 bit extended ascii encodings
     32# windows_125[0-6] - Windows codepages 1250 to 1256
    3533
    3634package multiread;
     
    172170
    173171    if ($self->{'encoding'} eq "iso_8859_1") {
    174     # Latin 1 extended ascii (ISO-8859-1)
     172    # special case for iso_8859_1 as &ascii2utf8($char) is faster than
     173    # &unicode2utf8(iso2unicode('1', $char))
    175174    return undef if (eof ($handle));
    176175    return &unicode::ascii2utf8 (getc ($handle));
    177176    }
    178 
    179     if ($self->{'encoding'} eq "iso_8859_6") {
    180     # 8 bit Arabic (IOS-8859-6)
    181     return undef if (eof ($handle));
    182     return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
    183     }
    184 
    185     if ($self->{'encoding'} eq "windows_1256") {
    186     # Windows 1256 (Arabic)
    187     return undef if (eof ($handle));
    188     return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
    189     }
    190 
    191     if ($self->{'encoding'} eq "windows_1251") {
    192     # Windows 1251 (Cyrillic)
    193     return undef if (eof ($handle));
    194     return &unicode::unicode2utf8(&unicode::windows2unicode ("1251", getc ($handle)));
     177   
     178    if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
     179    return undef if (eof ($handle));
     180    return &unicode::unicode2utf8(&unicode::iso2unicode ($1, getc($handle)));
     181    }
     182
     183    if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
     184    return undef if (eof ($handle));
     185    return &unicode::unicode2utf8(&unicode::windows2unicode ($1, getc($handle)));
     186    }
     187
     188    if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
     189    return undef if (eof ($handle));
     190    return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, getc($handle)));
    195191    }
    196192
     
    257253   
    258254    if ($self->{'encoding'} eq "iso_8859_1") {
    259     # extended ascii (ISO-8859-1)
     255    # special case for iso_8859_1 as &ascii2utf8($line) is faster than
     256    # &unicode2utf8(iso2unicode('1', $line))
    260257    my $line = "";
    261258    if (defined ($line = <$handle>)) {
     
    265262    }
    266263   
    267     if ($self->{'encoding'} eq "iso_8859_6") {
    268     # 8 bit arabic (ISO-8859-6)
    269     my $line = "";
    270     if (defined ($line = <$handle>)) {
    271         return &unicode::unicode2utf8(&unicode::arabic2unicode ($line));
    272     }
    273     return undef;
    274     }
    275    
    276     if ($self->{'encoding'} eq "windows_1256") {
    277     # Windows 1256 (Arabic)
    278     my $line = "";
    279     if (defined ($line = <$handle>)) {
    280         return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
    281     }
    282     return undef;
    283     }
    284    
    285     if ($self->{'encoding'} eq "windows_1251") {
    286     # Windows 1251 (Cyrillic)
    287     my $line = "";
    288     if (defined ($line = <$handle>)) {
    289         return &unicode::unicode2utf8(&unicode::windows2unicode ("1251", $line));
     264    if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
     265    my $line = "";
     266    if (defined ($line = <$handle>)) {
     267        return &unicode::unicode2utf8(&unicode::iso2unicode ($1, $line));
     268    }
     269    return undef;
     270    }
     271
     272    if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
     273    my $line = "";
     274    if (defined ($line = <$handle>)) {
     275        return &unicode::unicode2utf8(&unicode::windows2unicode ($1, $line));
     276    }
     277    return undef;
     278    }
     279
     280    if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
     281    my $line = "";
     282    if (defined ($line = <$handle>)) {
     283        return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line));
    290284    }
    291285    return undef;
     
    339333   
    340334    if ($self->{'encoding'} eq "iso_8859_1") {
     335    # special case for iso_8859_1 as &ascii2utf8($text) is faster than
     336    # &unicode2utf8(iso2unicode('1', $text))
    341337    undef $/;
    342338    my $text = <$handle>;
     
    346342    }
    347343   
    348     if ($self->{'encoding'} eq "iso_8859_6") {
    349     my $text = <$handle>;
    350     undef $/;
    351     $/ = "\n";
    352     $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
    353     return;
    354     }
    355 
    356     if ($self->{'encoding'} eq "windows_1256") {
    357     undef $/;
    358     my $text = <$handle>;
    359     $/ = "\n";
    360     $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
    361     return;
    362     }
    363 
    364     if ($self->{'encoding'} eq "windows_1251") {
    365     undef $/;
    366     my $text = <$handle>;
    367     $/ = "\n";
    368     $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1251", $text));
    369     return;
    370     }
    371 }
    372 
     344    if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
     345    undef $/;
     346    my $text = <$handle>;
     347    $/ = "\n";
     348    $$outputref .= &unicode::unicode2utf8(&unicode::iso2unicode ($1, $text));
     349    return;
     350    }
     351
     352    if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
     353    undef $/;
     354    my $text = <$handle>;
     355    $/ = "\n";
     356    $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ($1, $text));
     357    return;
     358    }
     359
     360    if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
     361    undef $/;
     362    my $text = <$handle>;
     363    $/ = "\n";
     364    $$outputref .= &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $text));
     365    return;
     366    }
     367}
    373368
    3743691;
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1838 r1844  
    3434use diagnostics;
    3535use DateExtract;
     36use iso639;
     37
     38# if textcat returns an encoding that isn't in this list
     39# we'll print a warning and use the default encoding instead
     40%supported_encodings = (
     41            "ascii" => "",
     42            "iso_8859_1" => "",
     43            "windows_1252" => "",
     44            "iso_8859_2" => "",
     45            "windows_1250" => "",
     46            "iso_8859_3" => "",
     47            "iso_8859_4" => "",
     48            "iso_8859_5" => "",
     49            "windows_1251" => "",
     50            "koi8_r" => "",
     51            "koi8_u" => "",
     52            "iso_8859_6" => "",
     53            "windows_1256" => "",
     54            "iso_8859_7" => "",
     55            "windows_1253" => "",
     56            "iso_8859_8" => "",
     57            "windows_1255" => "",
     58            "iso_8859_9" => "",
     59            "windows_1254" => "",
     60            "gb" => ""
     61            );
    3662
    3763sub print_general_usage {
     
    3965
    4066    print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
    41     print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
    42     print STDERR "                     converted from these encodings and stored internally as\n";
    43     print STDERR "                     utf8. The default input_encoding is ascii. Accepted values\n";
    44     print STDERR "                     are:\n";
    45     print STDERR "                        iso_8859_1 (extended ascii)\n";
    46     print STDERR "                        Latin1 (the same as iso-8859-1)\n";
    47     print STDERR "                        ascii (7 bit ascii -- may be faster than Latin1 as no\n";
    48     print STDERR "                               conversion is neccessary)\n";
    49     print STDERR "                        gb (GB or GBK simplified Chinese)\n";
    50     print STDERR "                        iso_8859_6 (8 bit Arabic)\n";
    51     print STDERR "                        windows_1256 (Windows codepage 1256 (Arabic))\n";
    52     print STDERR "                        Arabic (the same as windows_1256)\n";
    53     print STDERR "                        utf8 (either utf8 or unicode -- automatically detected)\n";
    54     print STDERR "                        unicode (just unicode -- doesn't currently do endian\n";
    55     print STDERR "                                 detection)\n";
    56     print STDERR "                        windows_1251 (Windows codepage 1251 (Cyrillic))\n";
     67
    5768    print STDERR "   -process_exp      A perl regular expression to match against filenames.\n";
    5869    print STDERR "                     Matching filenames will be processed by this plugin.\n";
    5970    print STDERR "                     Each plugin has its own default process_exp. e.g HTMLPlug\n";
    6071    print STDERR "                     defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
    61     print STDERR "                     .htm or .html (case-insensitive).\n";
     72    print STDERR "                     .htm or .html (case-insensitive).\n\n";
     73
    6274    print STDERR "   -block_exp        Files matching this regular expression will be blocked from\n";
    6375    print STDERR "                     being passed to any further plugins in the list. This has no\n";
     
    6678    print STDERR "                     not have a default block_exp. e.g. by default HTMLPlug blocks\n";
    6779    print STDERR "                     any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n";
    68     print STDERR "                     file extensions.\n";
     80    print STDERR "                     file extensions.\n\n";
     81
     82
     83    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
     84    print STDERR "                     converted from these encodings and stored internally as\n";
     85    print STDERR "                     utf8. The default input_encoding is 'auto'. Accepted values\n";
     86    print STDERR "                     are:\n";
     87
     88    print STDERR "                       auto: Use text categorization algorithm to automatically\n";
     89    print STDERR "                         identify the encoding of each source document. This\n";
     90    print STDERR "                         will be slower than explicitly setting the encoding\n";
     91    print STDERR "                         but will work where more than one encoding is used\n";
     92    print STDERR "                         within the same collection.\n";
     93
     94    print STDERR "                       ascii: Plain 7 bit ascii. This may be a little faster than\n";
     95    print STDERR "                         using iso_8859_1. Beware of using 'ascii' on a collection\n";
     96    print STDERR "                         of documents that may contain characters outside of plain\n";
     97    print STDERR "                         7 bit ascii though (e.g. German or French documents\n";
     98    print STDERR "                         containing accents), use iso_8859_1 instead.\n";
     99
     100    print STDERR "                       utf8: either utf8 or unicode -- automatically detected\n";
     101    print STDERR "                       unicode: just unicode\n";
     102
     103    print STDERR "                       iso_8859_1: Latin1 (western european languages)\n";
     104    print STDERR "                       windows_1252: Windows codepage 1252 (WinLatin1)\n";
     105
     106    print STDERR "                       iso_8859_2: Latin2 (central and eastern european languages)\n";
     107    print STDERR "                       windows_1250: Windows codepage 1250 (WinLatin2)\n";
     108
     109    print STDERR "                       iso_8859_3: Latin3\n";
     110
     111    print STDERR "                       iso_8859_4: Latin4\n";
     112
     113    print STDERR "                       iso_8859_5: Cyrillic\n";
     114    print STDERR "                       windows_1251: Windows codepage 1251 (WinCyrillic)\n";
     115    print STDERR "                       koi8_r: Cyrillic - Russian\n";
     116    print STDERR "                       koi8_u: Cyrillic - Ukrainian\n";
     117
     118    print STDERR "                       iso_8859_6: Arabic\n";
     119    print STDERR "                       windows_1256: Windows codepage 1256 (WinArabic)\n";
     120
     121    print STDERR "                       iso_8859_7: Greek\n";
     122    print STDERR "                       windows_1253: Windows codepage 1253 (WinGreek)\n";
     123
     124    print STDERR "                       iso_8859_8: Hebrew\n";
     125    print STDERR "                       windows_1255: Windows codepage 1255 (WinHebrew)\n";
     126
     127    print STDERR "                       iso_8859_9: Latin5\n";
     128    print STDERR "                       windows_1254: Windows codepage 1254 (WinTurkish)\n";
     129
     130    print STDERR "                       gb: GB or GBK simplified Chinese\n\n";
     131
     132    print STDERR "   -default_encoding If -input_encoding is set to 'auto' and the text categorization\n";
     133    print STDERR "                     algorithm fails to extract the encoding or extracts an encoding\n";
     134    print STDERR "                     that is not supported by Greenstone, this encoding will be used\n";
     135    print STDERR "                     instead. The default is iso_8859_1\n\n";
     136
     137    print STDERR "   -extract_language Identify the language of each document and set 'Language' metadata. Note\n";
     138    print STDERR "                     that this will be done automatically if -input_encoding is 'auto'.\n";
     139    print STDERR "   -default_language If Greenstone fails to work out what language a document is the\n";
     140    print STDERR "                     'Language' metadata element will be set to this value. The default\n";
     141    print STDERR "                     is 'en' (ISO 639 language symbols should be used - en = English).\n";
     142    print STDERR "                     Note that if -input_encoding is not set to 'auto' and -extract_language\n";
     143    print STDERR "                     is not set, all documents will have their 'Language' metadata set to\n";
     144    print STDERR "                     this value.\n\n";
     145
    69146    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
    70     print STDERR "   -markup_acronyms  Added acronym metadata into document text\n\n";
    71     print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
    72     print STDERR "   -first            Comma seperated list of first sizes to extract from the text \n";
    73     print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n";
     147
     148    print STDERR "   -markup_acronyms  Add acronym metadata into document text\n\n";
     149
     150    print STDERR "   -first            Comma seperated list of first sizes to extract from the text\n";
     151    print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n\n";
     152
    74153    print STDERR "   -extract_email    Extract email addresses as metadata\n\n";
     154
    75155    print STDERR "   -extract_date     Extract dates pertaining to the content of documents about history\n\n";
    76156}
     
    86166    my $class = shift (@_);
    87167    my $plugin_name = shift (@_);
    88 
    89168    my $self = {};
    90     my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode|windows_1251)\$";
     169   
     170    my $enc = "^(";
     171    map {$enc .= "|$_";} keys %supported_encodings;
     172    my $denc = $enc . "|utf8|unicode)\$";
     173    $enc .= "|utf8|unicode|auto)\$";
     174   
    91175    $self->{'outhandle'} = STDERR;
    92176    my $year = (localtime)[5]+1900;
     
    94178    # general options available to all plugins
    95179    if (!parsargv::parse(\@_,
    96              qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
    97180             q^process_exp/.*/^, \$self->{'process_exp'},
    98181             q^block_exp/.*/^, \$self->{'block_exp'},
     182             qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
     183             qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
    99184             q^extract_acronyms^, \$self->{'extract_acronyms'},
    100185             q^extract_email^, \$self->{'extract_email'},
    101186             q^markup_acronyms^, \$self->{'markup_acronyms'},
    102187             q^extract_language^, \$self->{'extract_language'},
     188             q^default_language/.{2}/en^, \$self->{'default_language'},
    103189             q^first/.*/^, \$self->{'first'},
    104190             q^extract_date^, \$self->{'date_extract'},
    105              "maximum_date/\\d{4}/$year", \$self->{'max_year'},
     191             qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'},
    106192             q^no_bibliography^, \$self->{'no_biblio'},
    107              "maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1",
    108              \$self->{'max_century'},
     193             qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
    109194             "allow_extra_options")) {
    110195   
     
    145230    $self->{'block_exp'} = $self->get_default_block_exp ();
    146231    }
    147    
    148     # handle input_encoding aliases
    149     $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
    150     $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
    151232}
    152233
     
    204285    }
    205286
     287    my $outhandle = $self->{'outhandle'};
     288
    206289    my $filename = &util::filename_cat($base_dir, $file);
    207290    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
     
    211294    my $plugin_name = ref ($self);
    212295    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    213    
     296
     297    my ($language, $encoding);
     298    if ($self->{'input_encoding'} eq "auto") {
     299    # use textcat to automatically work out the input encoding and language
     300    ($language, $encoding) = $self->get_language_encoding ($filename);
     301
     302    } elsif ($self->{'extract_language'}) {
     303    # use textcat to get language metadata
     304    ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
     305    $encoding = $self->{'input_encoding'};
     306
     307    if ($extracted_encoding != $encoding && $self->{'verbosity'}) {
     308        print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
     309        print $outhandle "appears to be encoded as $extracted_encoding.";
     310    }
     311
     312    } else {
     313    $language = $self->{'default_language'};
     314    $encoding = $self->{'input_encoding'};
     315    }
     316
    214317    # create a new document
    215318    my $doc_obj = new doc ($filename, "indexed_doc");
     319    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
     320    $doc_obj->set_source_encoding ($encoding);
     321   
    216322   
    217323    # read in file ($text will be in utf8)
    218324    my $text = "";
    219     $self->read_file ($filename, \$text);
    220 
    221     if ($text !~ /\w/) {
    222     my $outhandle = $self->{'outhandle'};
     325    $self->read_file ($filename, $encoding, \$text);
     326
     327    if (!length ($text)) {
    223328    print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
    224329    return 0;
     
    260365sub read_file {
    261366    my $self = shift (@_);
    262     my ($filename, $textref) = @_;
     367    my ($filename, $encoding, $textref) = @_;
    263368
    264369    if (!-r $filename)
    265370    {
    266     print STDERR "Read permission denied for $filename\n";
     371    my $outhandle = $self->{'outhandle'};
     372    print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
    267373    return;
    268374    }
     
    272378    open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
    273379
    274     if ($self->{'input_encoding'} eq "ascii") {
     380    if ($encoding eq "ascii") {
    275381    undef $/;
    276382    $$textref = <FILE>;
     
    279385    my $reader = new multiread();
    280386    $reader->set_handle ('BasPlug::FILE');
    281     $reader->set_encoding ($self->{'input_encoding'});
     387    $reader->set_encoding ($encoding);
    282388    $reader->read_file ($textref);
    283389
    284     if ($self->{'input_encoding'} eq "gb") {
     390    if ($encoding eq "gb") {
    285391        # segment the Chinese words
    286392        $$textref = &cnseg::segment($$textref);
     
    289395
    290396    close FILE;
     397}
     398
     399# Uses textcat to work out the encoding and language of the text in
     400# $filename. All html tags are removed before processing.
     401# returns an array containing "language" and "encoding"
     402sub get_language_encoding {
     403    my $self = shift (@_);
     404    my ($filename) = @_;
     405    my $outhandle = $self->{'outhandle'};
     406
     407    # read in file
     408    open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
     409    undef $/;
     410    my $text = <FILE>;
     411    $/ = "\n";
     412    close FILE;
     413
     414    # remove all HTML tags
     415    $text =~ s/<[^>]*>//sg;
     416
     417    # get the language/encoding
     418    my @results = textcat::classify($text);
     419
     420#    foreach $i (@results) {
     421#   print STDERR "i: $i\n";
     422#    }
     423
     424    if (scalar @results != 1) {
     425    if ($self->{'input_encoding'} ne 'auto') {
     426        if ($self->{'extract_language'} && $self->{'verbosity'}) {
     427        print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
     428        print $outhandle "defaulting to $self->{'default_language'}\n";
     429        }       
     430        return ($self->{'default_language'}, $self->{'input_encoding'});
     431
     432    } else {
     433        if ($self->{'verbosity'}) {
     434        print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
     435        print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
     436        }
     437        return ($self->{'default_language'}, $self->{'default_encoding'});
     438    }
     439    }
     440
     441    # format language/encoding
     442    my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
     443    $language = $iso639::toiso639{lc($language)};
     444    die "Invalid language\n" if !defined $language;
     445
     446    if (!defined $encoding) {
     447    # if textcat returned no encoding info it is assumed to be iso_8859_1
     448    $encoding = "iso_8859_1";
     449    } else {
     450    # convert to the format we expect
     451    $encoding =~ s/windows/windows_/;
     452    $encoding =~ s/iso8859/iso_8859/;
     453    $encoding =~ s/^gb.*$/gb/;
     454    }
     455
     456    if (!defined $supported_encodings{$encoding}) {
     457    if ($self->{'verbosity'}) {
     458        print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
     459        print $outhandle "using $self->{'default_encoding'}\n";
     460    }
     461    $encoding = $self->{'default_encoding'};
     462    }
     463
     464    return ($language, $encoding);
    291465}
    292466
     
    351525
    352526    print $outhandle " extracting email addresses ...\n"
    353     if ($self->{'verbosity'} >= 2);
     527    if ($self->{'verbosity'} > 2);
    354528   
    355529    my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g);
     
    362536        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
    363537        print $outhandle "  extracting $address\n"
    364         if ($self->{'verbosity'} >= 3);
     538        if ($self->{'verbosity'} > 3);
    365539    }
    366540    }
    367541    print $outhandle " done extracting email addresses.\n"
    368     if ($self->{'verbosity'} >= 2);
     542    if ($self->{'verbosity'} > 2);
    369543
    370544}
     
    437611}
    438612
    439 
    440 # Identify the language of a section and add it to the metadata
    441 sub extract_language {
    442     my $self = shift (@_);
    443     my ($textref, $doc_obj, $thissection) = @_;
    444 
    445     # remove all HTML tags
    446     my $text = $$textref;
    447     $text =~ s/<P[^>]*>/\n/sgi;
    448     $text =~ s/<H[^>]*>/\n/sgi;
    449     $text =~ s/<[^>]*>//sgi;
    450     $text =~ tr/\n/\n/s;
    451 
    452     # get the language
    453     my @results = textcat::classify($text);
    454     @results = ("unknown") if ($#results > 2);
    455 
    456     # create language string and remove encoding information
    457     my $language = join(" or ", @results);
    458     $language =~ s/\-\w+//g;
    459     $doc_obj->add_utf8_metadata($thissection, "Language",  $language);
    460     # print "Language: ", time, "-> $language\n";
    461 
    462 }
    463 
    464613# extract acronyms from a section in a document. progress is
    465614# reported to outhandle based on the verbosity. both the Acronym
     
    472621
    473622    print $outhandle " extracting acronyms ...\n"
    474     if ($self->{'verbosity'} >= 2);
     623    if ($self->{'verbosity'} > 2);
    475624
    476625    my $acro_array =  &acronym::acronyms($textref);
     
    496645        $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
    497646        print $outhandle "  adding ". $acro->to_string() . "\n"
    498         if ($self->{'verbosity'} >= 3);
     647        if ($self->{'verbosity'} > 3);
    499648       
    500649    }
    501650    }
    502651    print $outhandle " done extracting acronyms. \n"
    503     if ($self->{'verbosity'} >= 2);
     652    if ($self->{'verbosity'} > 2);
    504653}
    505654
     
    510659
    511660    print $outhandle " marking up acronyms ...\n"
    512     if ($self->{'verbosity'} >= 2);
     661    if ($self->{'verbosity'} > 2);
    513662
    514663    #self is passed in to check for verbosity ...
     
    516665
    517666    print $outhandle " done marking up acronyms. \n"
    518     if ($self->{'verbosity'} >= 2);
     667    if ($self->{'verbosity'} > 2);
    519668
    520669    return $text;
     
    522671
    5236721;
    524 
    525 
    526 
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r1699 r1844  
    377377            $value =~ s/\"$//;
    378378            $value =~ s/\s+/ /gs;
    379             print "adding Creator of $value\n";
    380379            $doc_obj->add_utf8_metadata($section, "Creator", $value);
    381380            print $outhandle " extracted Creator metadata \"$value\"\n"
    382                 if ($self->{'verbosity'} >= 2);
     381                if ($self->{'verbosity'} > 2);
    383382            next;
    384383            }
     
    405404            $doc_obj->add_utf8_metadata($section, $field, $value);
    406405            print $outhandle " extracted \"$field\" metadata \"$value\"\n"
    407             if ($self->{'verbosity'} >= 2);
     406            if ($self->{'verbosity'} > 2);
    408407            next;
    409408        }
     
    427426            $doc_obj->add_utf8_metadata ($section, $field, $title);
    428427            print $outhandle " extracted \"$field\" metadata \"$title\"\n"
    429                 if ($self->{'verbosity'} >= 2);
     428                if ($self->{'verbosity'} > 2);
    430429            next;
    431430            }
     
    445444        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
    446445        print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
    447         if ($self->{'verbosity'} >= 2);
     446        if ($self->{'verbosity'} > 2);
    448447        next;
    449448    }
     
    467466            $doc_obj->add_utf8_metadata ($section, $tag, $word);
    468467            print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
    469                 if ($self->{'verbosity'} >= 2);
     468                if ($self->{'verbosity'} > 2);
    470469            }
    471470        }
  • trunk/gsdl/perllib/unicode.pm

    r1227 r1844  
    4949}
    5050
    51 # arabic2unicode takes an 8 bit Arabic string (ISO-8859-6)
    52 # and returns a unicode array
    53 sub arabic2unicode {
    54     my ($in) = @_;
    55     my $out = [];
    56 
    57     my $i = 0;
    58     my $len = length($in);
    59     while ($i < $len) {
    60     my $c = ord(substr ($in, $i, 1));
    61     $c += (1567-191) if ($c >= 0x80);
    62     push (@$out, $c);
    63     $i++;
    64     }
    65 
    66     return $out;
    67 }
    68 
    69 # windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic))
     51# windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic))
    7052# and returns a unicode array. These encodings are similar to but not
    7153# identical to the corresponding ISO-8859 encodings.
     54#
     55# $encoding should be the code page name (e.g. '1252')
    7256#
    7357# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
     
    7862    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
    7963                      "WINDOWS", "$encoding.TXT");
     64    return $out unless &loadmapping ($encoding, $mapfile);
     65
     66    my $i = 0;
     67    my $len = length($in);
     68    while ($i < $len) {
     69    my $c = ord(substr ($in, $i, 1));
     70    $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
     71    push (@$out, $c);
     72    $i++;
     73    }
     74
     75    return $out;
     76}
     77
     78# iso2unicode takes an iso-8859 encoded string (e.g. iso-8859-6 (Arabic))
     79# and returns a unicode array. This function is much like windows2unicode()
     80# except that only characters >= 0xA0 are read from the mapping file (since
     81# all characters below that are the same for all iso-8859 character sets
     82# and therefore already the same as unicode).
     83#
     84# Note that while this function will work for iso-8859-1 (latin 1) it'll be
     85# much faster to use ascii2unicode() or ascii2utf8()
     86#
     87# $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the
     88# encoding is
     89#
     90# The map files for these encodings should be in unicode/MAPPINGS/ISO_8859
     91sub iso2unicode {
     92    my ($encoding, $in) = @_;
     93    my $out = [];
     94
     95    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
     96                      "ISO_8859", "$encoding.TXT");
     97    return $out unless &loadmapping ($encoding, $mapfile);
     98
     99    my $i = 0;
     100    my $len = length($in);
     101    while ($i < $len) {
     102    my $c = ord(substr ($in, $i, 1));
     103    $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
     104    push (@$out, $c);
     105    $i++;
     106    }
     107
     108    return $out;
     109}
     110
     111# cyrillic2unicode is basically identical to windows2unicode, the only
     112# difference being that the map files live in unicode/MAPPINGS/CYRILLIC
     113#
     114# values for $encoding may be 'koi8_r' or 'koi8_u'
     115sub cyrillic2unicode {
     116    my ($encoding, $in) = @_;
     117    my $out = [];
     118
     119    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
     120                      "CYRILLIC", "$encoding.txt");
    80121    return $out unless &loadmapping ($encoding, $mapfile);
    81122
Note: See TracChangeset for help on using the changeset viewer.