Changeset 1870


Ignore:
Timestamp:
2001-01-29T14:54:58+13:00 (23 years ago)
Author:
sjboddie
Message:

Tidied up language support stuff.

Location:
trunk/gsdl
Files:
10 added
3 deleted
17 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/makemapfile.pl

    r1868 r1870  
    2626###########################################################################
    2727
    28 # Creates a binary map file for use by complex character encodings
    29 # (e.g. CJK encodings like GBK and Shift-JIS). The map file is written to
    30 # the $GSDLHOME/unicode directory.
    31 
    3228BEGIN {
    3329    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
     
    3733
    3834use parsargv;
    39 use cjk;
     35use util;
     36
     37# %translations is of the form:
     38#
     39# encodings{encodingname-encodingname}->blocktranslation
     40# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
     41#
     42# Any of the top translation blocks can point to an undefined
     43# value. This data structure aims to allow fast translation and
     44# efficient storage.
     45%translations = ();
     46
     47# @array256 is used for initialisation, there must be
     48# a better way...
     49@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     50         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     51         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     52         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     53         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     54         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     55         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     56         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     57         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     58         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     59         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     60         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     61         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     62         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     63         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     64         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
     65
    4066&main();
    4167
     
    5581    }
    5682
    57     &cjk::makeencodingmapfile ($encoding, $mapfile);
     83    if (!&loadencoding ($encoding, $mapfile)) {
     84    die "couldn't load encoding $encoding";
     85    }
     86
     87    # write out map files
     88    &writemapfile ("$encoding-unicode", $encoding, 1);
     89    &writemapfile ("unicode-$encoding", $encoding, 0);
    5890}
     91
     92sub writemapfile {
     93    my ($encoding, $filename, $tounicode) = @_;
     94
     95    $filename .= ".ump"; # unicode map file
     96    if ($tounicode) {
     97    $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "to_uc", $filename);
     98    } else {
     99    $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "from_uc", $filename);
     100    }
     101
     102    die "translation not defined" if (!defined $translations{$encoding});
     103    my $block = $translations{$encoding};
     104
     105    print "writing $filename\n";
     106    open (MAPFILE, ">" . $filename) || die;
     107    binmode (MAPFILE);
     108
     109    my ($i, $j);
     110    for ($i=0; $i<256; $i++) {
     111    if (ref ($block->[$i]) eq "ARRAY") {
     112        print MAPFILE pack ("C", $i);
     113        for ($j=0; $j<256; $j++) {
     114        # unsigned short in network order
     115        print MAPFILE pack ("CC", int($block->[$i]->[$j] / 256),
     116                    $block->[$i]->[$j] % 256);
     117        }
     118    }
     119    }
     120    close (MAPFILE);
     121}
     122
     123# loadencoding expects the mapfile to contain (at least) two
     124# tab-separated fields. The first field is the mapped value
     125# and the second field is the unicode value.
     126#
     127# It returns 1 if successful, 0 if unsuccessful
     128sub loadencoding {
     129    my ($encoding, $mapfile) = @_;
     130   
     131    my $to = "$encoding-unicode";
     132    my $from = "unicode-$encoding";
     133
     134    # check to see if the encoding has already been loaded
     135    if (defined $translations{$to} && defined $translations{$from}) {
     136    return 1;
     137    }
     138
     139    return 0 unless open (MAPFILE, $mapfile);
     140
     141    my ($line, @line);
     142    $translations{$to} = [@array256];
     143    $translations{$from} = [@array256];
     144    while (defined ($line = <MAPFILE>)) {
     145    chomp $line;
     146    # remove comments
     147    $line =~ s/\#.*$//;
     148    next unless $line =~ /\S/;
     149
     150    # split the line into fields and do a few
     151    # simple sanity checks
     152    @line = split (/\t/, $line);
     153    next unless (scalar(@line) >= 2 &&
     154             $line[0] =~ /^0x/ &&
     155             $line[1] =~ /^0x/);
     156
     157    my $char = hex($line[0]);
     158    my $unic = hex($line[1]);
     159
     160    # might need this for some versions of gb but not gbk
     161#   $char = $char | 0x8080 unless ($encoding =~ /gbk/i);
     162
     163    &addchartrans ($translations{$to}, $char, $unic);
     164    &addchartrans ($translations{$from}, $unic, $char);
     165    }
     166
     167    close (MAPFILE);
     168
     169    return 1;
     170}
     171
     172# addchartrans adds one character translation to a translation block.
     173# It also simplifies the translation block if possible.
     174sub addchartrans {
     175    my ($block, $from, $to) = @_;
     176    my $i = 0;
     177
     178    my $high = ($from / 256) % 256;
     179    my $low = $from % 256;
     180
     181    if (ref ($block->[$high]) ne "ARRAY") {
     182    $block->[$high] = [@array256];
     183    }
     184    $block->[$high]->[$low] = $to;
     185}
  • trunk/gsdl/etc/main.cfg

    r1868 r1870  
    7878# Define the interface languages and encodings supported by this receptionist
    7979
    80 # An "Encoding" line defines an encoding to be used by the receptionist
    81 # options are:
    82 # shortname -- Identifier for the given encoding. The shortname option is
    83 #              mandatory and must be unique for each "Encoding" line.
     80# An "Encoding" line defines an encoding to be used by the receptionist.
     81# Uncomment "Encoding" lines to include an encoding on your "preferences" page.
     82# Encoding line options are:
     83# shortname -- The standard charset label for the given encoding. The
     84#              shortname option is mandatory.
    8485# longname  -- The display name of the given encoding. If longname isn't set
    8586#              it will default to using shortname instead.
    86 # type      -- The type of encoding. Note that for most encodings this
    87 #              value is the directory name under which the map file for
    88 #              this encoding resides in the Greenstone unicode/MAPPINGS
    89 #              directory (e.g. 'WINDOWS', 'ISO_8859' etc.). It may also
    90 #              take the values 'CJK' and 'UTF8'.
    91 # mapfile   -- The name of the map file for use when converting between
    92 #              utf8 and the given encoding. The mapfile option is mandatory
    93 #              for all encoding types with the exception of UTF8. If type
    94 #              is CJK, mapfile is the abbreviated name of the encoding as
    95 #              used by the binary mapping files (.ump files). i.e. if the
    96 #              encoding uses the map files gbku.ump and ugbk.ump, mapfile
    97 #              will be set to "gbk".
    98 # label     -- The standard label to which you must set the value of
    99 #              "charset" within http headers or html meta tags to get a web
    100 #              browser to use the given encoding. The label option is
    101 #              mandatory.
    102 Encoding shortname=utf8 "longname=Unicode (UTF-8)" type=UTF8 label=UTF-8
    103 Encoding shortname=iso1 "longname=Western (ISO-8859-1)" type=ISO_8859 mapfile=1.TXT label=ISO-8859-1
    104 Encoding shortname=w1251 "longname=Cyrillic (Windows-1251)" type=WINDOWS mapfile=1251.TXT label=windows-1251
    105 Encoding shortname=w1256 "longname=Arabic (Windows-1256)" type=WINDOWS mapfile=1256.TXT label=windows-1256
    106 Encoding shortname=w1256 "longname=Central European (Windows-1250)" type=WINDOWS mapfile=1250.TXT label=windows-1250
    107 Encoding shortname=gb "longname=Chinese Simplified (GBK)" type=CJK label=GBK mapfile=gbk
    108 Encoding shortname=sjis "longname=Japanese (Shift-JIS)" type=CJK label=shift_jis mapfile=sjis
    109 Encoding shortname=koi8r "longname=Cyrillic (KOI8-R)" type=CYRILLIC mapfile=koi8_r.txt label=koi8-r
     87# map       -- The name of the map file (i.e. the .ump file) for use when
     88#              converting between unicode and the given encoding. The map
     89#              option is mandatory for all encoding lines except the
     90#              special case for utf8.
    11091
    111 # The following encoding is not currently supported
    112 # Encoding shortname=eucjp "longname=Japanese (EUC)" type=CJK label=euc-jp mapfile=jis
     92# The utf8 encoding is handled internally and doesn't require a map file.
     93# As a rule the utf8 encoding should always be enabled, especially if you
     94# have collections of documents that may not all be in the same
     95# language/encoding.
     96Encoding shortname=utf-8 "longname=Unicode (UTF-8)"
     97
     98# The ISO-8859 series
     99Encoding shortname=iso-8859-1 "longname=Western (ISO-8859-1)" map=8859_1.ump
     100#Encoding shortname=iso-8859-2 "longname=Central European (ISO-8859-2)" map=8859_2.ump
     101#Encoding shortname=iso-8859-3 "longname=Latin 3 (ISO-8859-3)" map=8859_3.ump
     102#Encoding shortname=iso-8859-4 "longname=Latin 4 (ISO-8859-4)" map=8859_4.ump
     103#Encoding shortname=iso-8859-5 "longname=Cyrillic (ISO-8859-5)" map=8859_5.ump
     104#Encoding shortname=iso-8859-6 "longname=Arabic (ISO-8859-6)" map=8859_6.ump
     105#Encoding shortname=iso-8859-7 "longname=Greek (ISO-8859-7)" map=8859_7.ump
     106#Encoding shortname=iso-8859-8 "longname=Hebrew (ISO-8859-8)" map=8859_8.ump
     107#Encoding shortname=iso-8859-9 "longname=Turkish (ISO-8859-9)" map=8859_9.ump
     108
     109# Windows codepages
     110#Encoding shortname=windows-1250 "longname=Central European (Windows-1250)" map=win1250.ump
     111#Encoding shortname=windows-1251 "longname=Cyrillic (Windows-1251)" map=win1251.ump
     112#Encoding shortname=windows-1252 "longname=Western (Windows-1252)" map=win1252.ump
     113#Encoding shortname=windows-1253 "longname=Greek (Windows-1253)" map=win1253.ump
     114#Encoding shortname=windows-1254 "longname=Turkish (Windows-1254)" map=win1254.ump
     115#Encoding shortname=windows-1255 "longname=Hebrew (Windows-1255)" map=win1255.ump
     116Encoding shortname=windows-1256 "longname=Arabic (Windows-1256)" map=win1256.ump
     117#Encoding shortname=windows-1257 "longname=Baltic (Windows-1257)" map=win1257.ump
     118#Encoding shortname=windows-1258 "longname=Vietnamese (Windows-1258)" map=win1258.ump
     119#Encoding shortname=windows-874 "longname=Thai (Windows-874)" map=win874.ump
     120
     121# KOI8 Cyrillic encodings
     122#Encoding shortname=koi8-r "longname=Cyrillic (KOI8-R)" map=koi8_r.ump
     123#Encoding shortname=koi8-u "longname=Cyrillic (KOI8-U)" map=koi8_u.ump
     124
     125# CJK encodings (note that Shift-JIS Japanese isn't currently supported)
     126Encoding shortname=gbk "longname=Chinese Simplified (GBK)" map=gbk.ump
     127Encoding shortname=big5 "longname=Chinese Traditional (Big5)" map=big5.ump
     128Encoding shortname=euc-jp "longname=Japanese (EUC)" map=euc_jp.ump
     129Encoding shortname=euc-kr "longname=Korean (UHC)" map=uhc.ump
    113130
    114131
     
    126143#                     interface language. This should be set to the
    127144#                     "shortname" of a valid "Encoding" line
    128 Language shortname=en longname=English default_encoding=iso1
    129 Language shortname=fr longname=French default_encoding=iso1
    130 Language shortname=zh longname=Chinese default_encoding=gb
    131 Language shortname=de longname=German default_encoding=iso1
    132 Language shortname=es longname=Spanish default_encoding=iso1
    133 Language shortname=mi longname=Maori default_encoding=iso1
    134 Language shortname=ar longname=Arabic default_encoding=w1256
    135 Language shortname=pt longname=Portuguese default_encoding=iso1
    136 Language shortname=nl longname=Dutch default_encoding=iso1
     145Language shortname=en longname=English default_encoding=iso-8859-1
     146Language shortname=fr longname=French default_encoding=iso-8859-1
     147Language shortname=zh longname=Chinese default_encoding=gbk
     148Language shortname=de longname=German default_encoding=iso-8859-1
     149Language shortname=es longname=Spanish default_encoding=iso-8859-1
     150Language shortname=mi longname=Maori default_encoding=iso-8859-1
     151Language shortname=ar longname=Arabic default_encoding=windows-1256
     152Language shortname=pt longname=Portuguese default_encoding=iso-8859-1
     153Language shortname=nl longname=Dutch default_encoding=iso-8859-1
    137154
    138155
  • trunk/gsdl/lib/gsdlunicode.cpp

    r1310 r1870  
    263263// setmapfile will cause loadmapfile to be called when conversion is
    264264// needed
    265 bool mapconvert::setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
    266                  unsigned short theabsentc) {
     265bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
    267266  // check to see if the mapfile has been already loaded
    268   if (mapdata.loaded && gsdlhome == thegsdlhome &&
    269       encoding == theencoding && absentc == theabsentc)
    270     return true;
     267  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
    271268
    272269  unloadmapfile ();
    273   gsdlhome = thegsdlhome;
    274   encoding = theencoding;
     270  mapfile = themapfile;
    275271  absentc = theabsentc;
    276272 
     
    281277
    282278// loadmapfile should be called before any conversion is done
    283 bool mapconvert::loadmapfile (const text_t &thegsdlhome,
    284                   const text_t &theencoding,
     279bool mapconvert::loadmapfile (const text_t &themapfile,
    285280                  unsigned short theabsentc) {
    286281  FILE *mapfilein = (FILE *)NULL;
    287282
    288283  // check to see if the mapfile has been already loaded
    289   if (mapdata.loaded && gsdlhome == thegsdlhome &&
    290       encoding == theencoding && absentc == theabsentc)
    291     return true;
     284  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
    292285
    293286  unloadmapfile ();
    294   gsdlhome = thegsdlhome;
    295   encoding = theencoding;
     287  mapfile = themapfile;
    296288  absentc = theabsentc;
    297289
    298290  // open the map file
    299   text_t filename = filename_cat (gsdlhome, "unicode");
    300   filename = filename_cat (filename, encoding);
    301   filename += ".ump";
    302   char *cfilename = filename.getcstr();
     291  char *cfilename = mapfile.getcstr();
    303292  if (cfilename == (char *)NULL) return false;
    304293  mapfilein = fopen(cfilename, "rb");
     
    359348unsigned short mapconvert::convert (unsigned short c) {
    360349  if (!mapdata.loaded) {
    361     if (!gsdlhome.empty() && !encoding.empty() &&
    362     loadmapfile (gsdlhome, encoding, absentc)) {
     350    if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
    363351      // do nothing, successfully loaded database
    364352    } else return absentc;
  • trunk/gsdl/lib/gsdlunicode.h

    r1310 r1870  
    114114  // setmapfile will cause loadmapfile to be called when conversion is
    115115  // needed
    116   bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
    117            unsigned short theabsentc);
     116  bool setmapfile (const text_t &themapfile, unsigned short theabsentc);
    118117
    119118  // loadmapfile should be called before any conversion is done
    120   bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
    121             unsigned short theabsentc);
     119  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
    122120  void unloadmapfile ();
    123121
     
    129127
    130128protected:
    131   text_t gsdlhome;
    132   text_t encoding;
     129  text_t mapfile;
    133130  unsigned short absentc;
    134131  mapdata_t mapdata;
     
    146143
    147144  // setmapfile will cause loadmapfile to be called when conversion is needed
    148   bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
    149             unsigned short theabsentc) {
    150     return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
     145  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
     146    return converter.setmapfile (themapfile, theabsentc);
    151147  };
    152148
    153149  // loadmapfile should be called before any conversion takes
    154150  // place
    155   bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
    156             unsigned short theabsentc) {
    157     return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
     151  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
     152    return converter.loadmapfile (themapfile, theabsentc);
    158153  };
    159154
     
    189184
    190185  // setmapfile will cause loadmapfile to be called when conversion is needed
    191   bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
    192             unsigned short theabsentc) {
    193     return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
     186  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
     187    return converter.setmapfile (themapfile, theabsentc);
    194188  };
    195189
    196190  // loadmapfile should be called before any conversion takes
    197191  // place
    198   bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
    199             unsigned short theabsentc) {
    200     return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
     192  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
     193    return converter.loadmapfile (themapfile, theabsentc);
    201194  };
    202195
  • trunk/gsdl/mappings/README

    r1868 r1870  
    1 This directory contains mapping files for converting various character
    2 encodings to and from unicode.
     1This directory and its subdirectories contain .ump mapping files for
     2converting various character encodings to and from unicode.
     3
     4To generate .ump files use a command like "makemapfile.pl -encoding
     5encodingname -mapfile textmapfile" where encodingname becomes the filename
     6of the two new .ump files and textmapfile is a plain text file containing a
     7tab separated list of the form:
     80x8167        0x201C
     9where the first column is the hexadecimal value of the encoded character
     10and the second is the hexadecimal value of its unicode equivalent.
     11
     12
     13
     14The following .ump files were generated from their corresponding Microsoft
     15codepages. These codepages do, in some cases, differ very slightly from the
     16standards they were based on but we've used them anyway as they're so
     17extensively used on the web.
     18
     19* gbk.ump: Simplified Chinese - generated from Microsoft's codepage 936
     20* shiftjis.ump: Japanese - generated from Microsoft's codepage 932
     21* uhc.ump: UHC Korean - generated from Microsoft's codepage 949
     22* big5.ump: Traditional Chinese - generated from Microsoft's codepage 950
  • trunk/gsdl/perllib/doc.pm

    r1868 r1870  
    694694
    695695    $self->set_utf8_metadata_element ($section, $field,
    696                       &unicode::ascii2utf8($value));
     696                      &unicode::ascii2utf8(\$value));
    697697}
    698698
     
    716716
    717717    $self->add_utf8_metadata ($section, $field,
    718                   &unicode::ascii2utf8($value));
     718                  &unicode::ascii2utf8(\$value));
    719719}
    720720
     
    799799    # convert the text to UTF-8 encoded unicode characters
    800800    # and add the text
    801     $self->add_utf8_text($section, &unicode::ascii2utf8($text));
     801    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
    802802}
    803803
  • trunk/gsdl/perllib/multiread.pm

    r1868 r1870  
    2828# utf8             - either utf8 or unicode (automatically detected)
    2929# unicode          - just unicode (doesn't currently do endian detection)
    30 # gb               - GB
    31 # iso_8859_[1-9]   - 8 bit extended ascii encodings
    32 # windows_125[0-8] - Windows codepages 1250 to 1258
    33 # windows 874      - Windows codepage 874
    34 # iscii_de         - ISCII Devanagari
    35 # shift_jis        - Shift-JIS
    36 # euc_jp           - EUC encoded Japanese
    37 # uhc              - Unified Hangul Code (Korean)
     30#
     31# plus all encodings in the "encodings" package
    3832
    3933package multiread;
    4034
    4135use unicode;
    42 use cjk;
    4336
    4437sub new {
     
    203196    }
    204197
    205     if ($self->{'encoding'} eq "gb") {
    206     # GB or GBK
     198    if ($self->{'encoding'} eq "iso_8859_1") {
     199    # we'll use ascii2utf8() for this as it's faster than going
     200    # through convert2unicode()
    207201    my $line = "";
    208202    if (defined ($line = <$handle>)) {
    209         return &unicode::unicode2utf8 (&cjk::gb2unicode ($line));
    210     }
    211     return undef;
    212     }
    213    
    214     if ($self->{'encoding'} eq "iso_8859_1") {
    215     # special case for iso_8859_1 as &ascii2utf8($line) is faster than
    216     # &unicode2utf8(iso2unicode('1', $line))
    217     my $line = "";
    218     if (defined ($line = <$handle>)) {
    219         return &unicode::ascii2utf8 ($line);
    220     }
    221     return undef;
    222     }
    223    
    224     if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
    225     my $line = "";
    226     if (defined ($line = <$handle>)) {
    227         return &unicode::unicode2utf8(&unicode::iso2unicode ($1, $line));
    228     }
    229     return undef;
    230     }
    231 
    232     if ($self->{'encoding'} =~ /windows_(\d{3,4})$/) {
    233     my $line = "";
    234     if (defined ($line = <$handle>)) {
    235         return &unicode::unicode2utf8(&unicode::windows2unicode ($1, $line));
    236     }
    237     return undef;
    238     }
    239 
    240     if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
    241     my $line = "";
    242     if (defined ($line = <$handle>)) {
    243         return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line));
    244     }
    245     return undef;
    246     }
    247 
    248     if ($self->{'encoding'} eq "iscii_de") {
    249     my $line = "";
    250     if (defined ($line = <$handle>)) {
    251         return &unicode::unicode2utf8(&unicode::iscii2unicode ("Devanagari", $line));
    252     }
    253     return undef;
    254     }
    255 
    256     # unknown encoding
     203        return &unicode::ascii2utf8 (\$line);
     204    }
     205    }
     206
     207    # everything else uses unicode::convert2unicode
     208    my $line = "";
     209    if (defined ($line = <$handle>)) {
     210    return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
     211    }
     212
    257213    return undef;
    258214}
     
    292248    }
    293249
    294     if ($self->{'encoding'} eq "gb") {
     250    if ($self->{'encoding'} eq "iso_8859_1") {
     251    # we'll use ascii2utf8() for this as it's faster than going
     252    # through convert2unicode()
    295253    undef $/;
    296254    my $text = <$handle>;
    297255    $/ = "\n";
    298     $$outputref .= &unicode::unicode2utf8 (&cjk::gb2unicode ($text));
     256    $$outputref .= &unicode::ascii2utf8 (\$text);
    299257    return;
    300258    }
    301    
    302     if ($self->{'encoding'} eq "iso_8859_1") {
    303     # special case for iso_8859_1 as &ascii2utf8($text) is faster than
    304     # &unicode2utf8(iso2unicode('1', $text))
    305     undef $/;
    306     my $text = <$handle>;
    307     $/ = "\n";
    308     $$outputref .= &unicode::ascii2utf8 ($text);
    309     return;
    310     }
    311 
    312     if ($self->{'encoding'} eq "shift_jis") {
    313     undef $/;
    314     my $text = <$handle>;
    315     $/ = "\n";
    316     $$outputref .= &unicode::unicode2utf8(&cjk::sjis2unicode ($text));
    317     return;
    318     }
    319 
    320     if ($self->{'encoding'} eq "euc_jp") {
    321     undef $/;
    322     my $text = <$handle>;
    323     $/ = "\n";
    324     $$outputref .= &unicode::unicode2utf8(&cjk::eucjp2unicode ($text));
    325     return;
    326     }
    327 
    328     if ($self->{'encoding'} eq "euc_kr") {
    329     undef $/;
    330     my $text = <$handle>;
    331     $/ = "\n";
    332     $$outputref .= &unicode::unicode2utf8(&cjk::euckr2unicode ($text));
    333     return;
    334     }
    335 
    336     if ($self->{'encoding'} eq "uhc") {
    337     undef $/;
    338     my $text = <$handle>;
    339     $/ = "\n";
    340     $$outputref .= &unicode::unicode2utf8(&cjk::uhc2unicode ($text));
    341     return;
    342     }
    343 
    344     # if we get to here we assume it's a simple 8 bit encoding
     259
     260    # everything else uses unicode::convert2unicode
    345261    undef $/;
    346262    my $text = <$handle>;
    347263    $/ = "\n";
    348     $$outputref .= &unicode::unicode2utf8(&unicode::simple2unicode ($self->{'encoding'}, $text));
     264    $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
    349265}
    350266
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1868 r1870  
    2828use parsargv;
    2929use multiread;
     30use encodings;
    3031use cnseg;
    3132use acronym;
     
    3435use diagnostics;
    3536use DateExtract;
    36 use iso639;
    37 
    38 # if textcat returns an encoding that isn't in this list
    39 # we'll print a warning and use the default encoding instead
    40 %supported_encodings = (
    41             "ascii" => "",
    42             "utf8" => "",
    43             "iso_8859_1" => "",
    44             "windows_1252" => "",
    45             "iso_8859_2" => "",
    46             "windows_1250" => "",
    47             "iso_8859_3" => "",
    48             "iso_8859_4" => "",
    49             "iso_8859_5" => "",
    50             "windows_1251" => "",
    51             "koi8_r" => "",
    52             "koi8_u" => "",
    53             "iso_8859_6" => "",
    54             "windows_1256" => "",
    55             "iso_8859_7" => "",
    56             "windows_1253" => "",
    57             "iso_8859_8" => "",
    58             "windows_1255" => "",
    59             "iso_8859_9" => "",
    60             "windows_1254" => "",
    61             "gb" => "",
    62             "iscii_de" => "",
    63             "windows_1257" => "",
    64             "windows_874" => "",
    65             "windows_1258" => "",
    66             "shift_jis" => "",
    67             "euc_jp" => "",
    68             "uhc" => ""
    69             );
    7037
    7138sub print_general_usage {
     
    10976    print STDERR "                       unicode: just unicode\n";
    11077
    111     print STDERR "                       iso_8859_1: Latin1 (western european languages)\n";
    112     print STDERR "                       windows_1252: Windows codepage 1252 (WinLatin1)\n";
    113 
    114     print STDERR "                       iso_8859_2: Latin2 (central and eastern european languages)\n";
    115     print STDERR "                       windows_1250: Windows codepage 1250 (WinLatin2)\n";
    116 
    117     print STDERR "                       iso_8859_3: Latin3\n";
    118 
    119     print STDERR "                       iso_8859_4: Latin4\n";
    120 
    121     print STDERR "                       iso_8859_5: Cyrillic\n";
    122     print STDERR "                       windows_1251: Windows codepage 1251 (WinCyrillic)\n";
    123     print STDERR "                       koi8_r: Cyrillic - Russian\n";
    124     print STDERR "                       koi8_u: Cyrillic - Ukrainian\n";
    125 
    126     print STDERR "                       iso_8859_6: Arabic\n";
    127     print STDERR "                       windows_1256: Windows codepage 1256 (WinArabic)\n";
    128 
    129     print STDERR "                       iso_8859_7: Greek\n";
    130     print STDERR "                       windows_1253: Windows codepage 1253 (WinGreek)\n";
    131 
    132     print STDERR "                       iso_8859_8: Hebrew\n";
    133     print STDERR "                       windows_1255: Windows codepage 1255 (WinHebrew)\n";
    134 
    135     print STDERR "                       iso_8859_9: Latin5\n";
    136     print STDERR "                       windows_1254: Windows codepage 1254 (WinTurkish)\n";
    137 
    138     print STDERR "                       gb: GB or GBK simplified Chinese\n";
    139 
    140     print STDERR "                       iscii_de: ISCII Devanagari\n";
    141 
    142     print STDERR "                       windows_1257: Windows codepage 1257 (WinBaltic)\n";
    143 
    144     print STDERR "                       windows_874: Windows codepage 874 (Thai)\n";
    145 
    146     print STDERR "                       windows_1258: Windows codepage 1258 (Vietnamese)\n";
    147 
    148     print STDERR "                       shift_jis: Shift-JIS (Japanese)\n";
    149     print STDERR "                       euc_jp: EUC encoded Japanese\n";
    150 
    151     print STDERR "                       uhc: Unified Hangul Code (Korean). This is a superset of\n";
    152     print STDERR "                            EUC encoded Korean\n\n";
    153 
     78    my $e = $encodings::encodings;
     79    foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
     80    print STDERR "                       $enc: $e->{$enc}->{'name'}\n";
     81    }
    15482
    15583    print STDERR "   -default_encoding If -input_encoding is set to 'auto' and the text categorization\n";
     
    196124   
    197125    my $enc = "^(";
    198     map {$enc .= "|$_";} keys %supported_encodings;
    199     my $denc = $enc . "|unicode)\$";
    200     $enc .= "|unicode|auto)\$";
     126    map {$enc .= "|$_";} keys %$encodings::encodings;
     127    my $denc = $enc . "ascii|utf8|unicode)\$";
     128    $enc .= "ascii|utf8|unicode|auto)\$";
    201129   
    202130    $self->{'outhandle'} = STDERR;
     
    444372    my @results = textcat::classify($text);
    445373
    446 #    foreach $i (@results) {
    447 #   print STDERR "i: $i\n";
    448 #    }
    449 
    450374    if (scalar @results != 1) {
    451375   
     
    468392    # format language/encoding
    469393    my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
    470     die "Invalid language\n" if !defined $language;
    471 
     394    if (!defined $language) {
     395    if ($self->{'verbosity'}) {
     396        print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
     397        print $outhandle "defaulting to $self->{'default_language'}\n";
     398    }
     399    $language = $self->{'default_language'};
     400    }
    472401    if (!defined $encoding) {
    473     # if textcat returned no encoding info it is assumed to be iso_8859_1
    474     $encoding = "iso_8859_1";
    475     }
    476 
    477     if (!defined $supported_encodings{$encoding}) {
     402    if ($self->{'verbosity'}) {
     403        print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
     404        print $outhandle "defaulting to $self->{'default_encoding'}\n";
     405    }
     406    $encoding = $self->{'default_encoding'};
     407    }
     408
     409    if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
     410    !defined $encodings::encodings->{$encoding}) {
    478411    if ($self->{'verbosity'}) {
    479412        print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
  • trunk/gsdl/perllib/unicode.pm

    r1868 r1870  
    3030
    3131package unicode;
    32 
    33 %translations = ();
     32use encodings;
    3433
    3534# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
     
    4948}
    5049
    51 # windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic))
    52 # and returns a unicode array. These encodings are similar to but not
    53 # identical to the corresponding ISO-8859 encodings.
    54 #
    55 # $encoding should be the code page name (e.g. '1252')
    56 #
    57 # The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
    58 sub windows2unicode {
    59     my ($encoding, $in) = @_;
    60     my $out = [];
    61 
    62     my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
    63                       "WINDOWS", "$encoding.TXT");
    64     return $out unless &loadmapping ($encoding, $mapfile);
    65 
    66     my $i = 0;
    67     my $len = length($in);
    68     while ($i < $len) {
    69     my $c = ord(substr ($in, $i, 1));
    70     $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
    71     push (@$out, $c);
    72     $i++;
    73     }
    74 
    75     return $out;
    76 }
    77 
    78 # iso2unicode takes an iso-8859 encoded string (e.g. iso-8859-6 (Arabic))
    79 # and returns a unicode array. This function is much like windows2unicode()
    80 # except that only characters >= 0xA0 are read from the mapping file (since
    81 # all characters below that are the same for all iso-8859 character sets
    82 # and therefore already the same as unicode).
    83 #
    84 # Note that while this function will work for iso-8859-1 (latin 1) it'll be
    85 # much faster to use ascii2unicode() or ascii2utf8()
    86 #
    87 # $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the
    88 # encoding is
    89 #
    90 # The map files for these encodings should be in unicode/MAPPINGS/ISO_8859
    91 sub iso2unicode {
    92     my ($encoding, $in) = @_;
    93     my $out = [];
    94 
    95     my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
    96                       "ISO_8859", "$encoding.TXT");
    97     return $out unless &loadmapping ($encoding, $mapfile);
    98 
    99     my $i = 0;
    100     my $len = length($in);
    101     while ($i < $len) {
    102     my $c = ord(substr ($in, $i, 1));
    103     $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
    104     push (@$out, $c);
    105     $i++;
    106     }
    107 
    108     return $out;
    109 }
    110 
    111 # cyrillic2unicode is basically identical to windows2unicode, the only
    112 # difference being that the map files live in unicode/MAPPINGS/CYRILLIC
    113 #
    114 # values for $encoding may be 'koi8_r' or 'koi8_u'
    115 sub cyrillic2unicode {
    116     my ($encoding, $in) = @_;
    117     my $out = [];
    118 
    119     my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
    120                       "CYRILLIC", "$encoding.txt");
    121     return $out unless &loadmapping ($encoding, $mapfile);
    122 
    123     my $i = 0;
    124     my $len = length($in);
    125     while ($i < $len) {
    126     my $c = ord(substr ($in, $i, 1));
    127     $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
    128     push (@$out, $c);
    129     $i++;
    130     }
    131 
    132     return $out;
    133 }
    134 
    135 # iscii2unicode is basically identical to iso2unicode, the only
    136 # difference being that the map files live in unicode/MAPPINGS/ISCII
    137 #
    138 # values for $encoding may be 'Devanagari' only at present
    139 sub iscii2unicode {
    140     my ($encoding, $in) = @_;
    141     my $out = [];
    142 
    143     my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
    144                       "ISCII", "$encoding.txt");
    145     return $out unless &loadmapping ($encoding, $mapfile);
    146 
    147     my $i = 0;
    148     my $len = length($in);
    149     while ($i < $len) {
    150     my $c = ord(substr ($in, $i, 1));
    151     $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
    152     push (@$out, $c);
    153     $i++;
    154     }
    155 
    156     return $out;
    157 }
    158 
    159 # ascii2utf8 takes a (extended) ascii string and
    160 # returns a UTF-8 encoded string. This is just
    161 # a faster version of "&unicode2utf8(&ascii2unicode($str));"
     50# ascii2utf8 takes a reference to an (extended) ascii string and returns a
     51# UTF-8 encoded string. This is just a faster version of
     52# "&unicode2utf8(&ascii2unicode($str));"
    16253sub ascii2utf8 {
    16354    my ($in) = @_;
     
    16657    my ($c);
    16758    my $i = 0;
    168     my $len = length($in);
    169     while ($i < $len) {
    170     $c = ord (substr ($in, $i, 1));
     59    my $len = length($$in);
     60    while ($i < $len) {
     61    $c = ord (substr ($$in, $i, 1));
    17162    if ($c < 0x80) {
    17263        # ascii character
     
    18374    return $out;
    18475}
    185 
    18676
    18777# unicode2utf8 takes a unicode array as input and encodes it
     
    210100    }
    211101    }
    212 
    213     return $out;
    214 }
    215 
     102    return $out;
     103}
    216104
    217105# utf82unicode takes a utf-8 string and produces a unicode
     
    268156}
    269157
    270 
    271158# unicode2ucs2 takes a unicode array and produces a UCS-2
    272159# unicode string (every two bytes forms a unicode character)
     
    282169    return $out;
    283170}
    284 
    285171
    286172# ucs22unicode takes a UCS-2 string and produces a unicode array
     
    301187}
    302188
    303 # loadmapping expects the mapfile to contain (at least) two
    304 # tab-separated fields. The first field is the mapped value
    305 # and the second field is the unicode value.
    306 #
    307 # It returns 1 if successful, 0 if unsuccessful
    308 sub loadmapping {
    309     my ($encoding, $mapfile) = @_;
     189# takes a reference to a string and returns a reference to a unicode array
     190sub convert2unicode {
     191    my ($encoding, $textref) = @_;
     192
     193    if (!defined $encodings::encodings->{$encoding}) {
     194    print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
     195    return [];
     196    }
     197
     198    my $encodename = "$encoding-unicode";
     199    my $enc_info = $encodings::encodings->{$encoding};
     200    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
     201                      "to_uc", $enc_info->{'mapfile'});
     202    if (!&loadmapencoding ($encodename, $mapfile)) {
     203    print STDERR "unicode: ERROR - could not load encoding $encodename\n";
     204    return [];
     205    }
    310206   
    311     my $to = "$encoding-unicode";
    312     my $from = "unicode-$encoding";
    313 
    314     # check to see if the encoding has already been loaded
    315     if (defined $translations{$to} && defined $translations{$from}) {
    316     return 1;
    317     }
    318 
    319     if (!open (MAPFILE, $mapfile)) {
    320     print STDERR "ERROR: unable to load mapfile $mapfile\n";
     207    if (defined $enc_info->{'converter'}) {
     208    my $converter = $enc_info->{'converter'};
     209    return &$converter ($encodename, $textref);
     210    }
     211
     212    if ($translations{$encodename}->{'count'} == 1) {
     213    return &singlebyte2unicode ($encodename, $textref);
     214    } else {
     215    return &doublebyte2unicode ($encodename, $textref);
     216    }
     217}
     218
     219# singlebyte2unicode converts simple 8 bit encodings where characters below
     220# 0x80 are normal ascii characters and the rest are decoded using the
     221# appropriate mapping files.
     222#
     223# Examples of encodings that may be converted using singlebyte2unicode are
     224# the iso-8859 and windows-125* series.
     225sub singlebyte2unicode {
     226    my ($encodename, $textref) = @_;
     227
     228    my @outtext = ();
     229    my $len = length($$textref);
     230    my ($c);
     231    my $i = 0;
     232
     233    while ($i < $len) {
     234    if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
     235        # normal ascii character
     236        push (@outtext, $c);
     237    } else {
     238        $c = &transchar ($encodename, $c);
     239        # put a white square (U+25A1) if cannot translate
     240        $c = 0x25A1 if $c == 0;
     241        push (@outtext, $c);
     242    }
     243    $i ++;
     244    }
     245    return \@outtext;
     246}
     247
     248# doublebyte2unicode converts simple two byte encodings where characters
     249# below code point 0x80 are single-byte characters and the rest are
     250# double-byte characters.
     251#
     252# Examples of encodings that may be converted using doublebyte2unicode are
     253# CJK encodings like GB encoded Chinese and UHC Korean.
     254#
     255# Note that no error checking is performed to make sure that the input text
     256# is valid for the given encoding.
     257#
     258# Also, encodings that may contain characters of more than two bytes are
     259# not supported (any EUC encoded text may in theory contain 3-byte
     260# characters but in practice only one and two byte characters are used).
     261sub doublebyte2unicode {
     262    my ($encodename, $textref) = @_;   
     263   
     264    my @outtext = ();
     265    my $len = length($$textref);
     266    my ($c1, $c2);
     267    my $i = 0;
     268
     269    while ($i < $len) {
     270    if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
     271        if ($i+1 < $len) {
     272        # double-byte character
     273        $c2 = ord(substr($$textref, $i+1, 1));
     274        my $c = &transchar ($encodename, ($c1 << 8) | $c2);
     275        # put a white square (U+25A1) if cannot translate
     276        $c = 0x25A1 if $c == 0;
     277        push (@outtext, $c);
     278        $i += 2;
     279       
     280        } else {
     281        # error
     282        print STDERR "unicode: ERROR missing second half of double-byte character\n";
     283        $i++;
     284        }
     285       
     286    } else {
     287        # single-byte character
     288        push (@outtext, $c1);
     289        $i++;
     290    }
     291    }
     292    return \@outtext;
     293}
     294
     295# Shift-JIS to unicode
     296# We can't use doublebyte2unicode for Shift-JIS because it uses some
     297# single-byte characters above code point 0x80 (i.e. half-width katakana
     298# characters in the range 0xA1-0xDF)
     299sub shiftjis2unicode {
     300    my ($encodename, $textref) = @_;
     301   
     302    my @outtext = ();
     303    my $len = length($$textref);
     304    my ($c1, $c2);
     305    my $i = 0;
     306
     307    while ($i < $len) {
     308    $c1 = ord(substr($$textref, $i, 1));
     309
     310    if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
     311        # Single-byte half-width katakana character or
     312        # JIS Roman yen or overline characters
     313        my $c = &transchar ($encodename, $c1);
     314        # - put a white square (U+25A1) if cannot translate
     315        $c = 0x25A1 if $c == 0;
     316        push (@outtext, $c);
     317        $i++;
     318
     319    } elsif ($c1 < 0x80) {
     320        # ASCII
     321        push (@outtext, $c1);
     322        $i ++;
     323
     324    } elsif ($c1 < 0xEF) {
     325        if ($i+1 < $len) {
     326        $c2 = ord(substr($$textref, $i+1, 1));
     327        if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
     328            # Double-byte shift-jis character
     329            my $c = &transchar ($encodename, ($c1 << 8) | $c2);
     330            # put a white square (U+25A1) if cannot translate
     331            $c = 0x25A1 if $c == 0;
     332            push (@outtext, $c);
     333        } else {
     334            # error
     335            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
     336        }
     337        $i += 2;
     338        } else {
     339        # error
     340        print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
     341        $i ++;
     342        }
     343    } else {
     344        # error
     345        print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
     346        $i ++;
     347    }
     348    }
     349    return \@outtext;
     350}
     351
     352sub transchar {
     353    my ($encoding, $from) = @_;
     354    my $high = ($from / 256) % 256;
     355    my $low = $from % 256;
     356
     357    return 0 unless defined $translations{$encoding};
     358
     359    my $block = $translations{$encoding}->{'map'};
     360
     361    if (ref ($block->[$high]) ne "ARRAY") {
    321362    return 0;
    322363    }
    323 
    324     my ($line, @line);
    325     $translations{$to} = {};
    326     $translations{$from} = {};
    327     while (defined ($line = <MAPFILE>)) {
    328     # remove comments
    329     $line =~ s/\#.*$//;
    330     next unless $line =~ /\S/;
    331 
    332     # split the line into fields and do a few
    333     # simple sanity checks
    334     @line = split (/\t/, $line);
    335     next unless (scalar(@line) >= 2 &&
    336              $line[0] =~ /^0x/ &&
    337              $line[1] =~ /^0x/);
    338 
    339     my $a = hex($line[0]);
    340     my $b = hex($line[1]);
    341 
    342     $translations{$to}->{$a} = $b;
    343     $translations{$from}->{$b} = $a;
    344     }
    345 
    346     close (MAPFILE);
    347 
    348     return 1;
    349 }
    350 
    351 
    352 
    353 
    354 
    355 
    356 
    357 
    358 
    359 ####################################################################################################
    360 
     364    return $block->[$high]->[$low];
     365}
    361366
    362367# %translations is of the form:
    363368#
    364 # encodings{encodingname-encodingname}->blocktranslation
     369# translations{encodingname-encodingname}->{'map'}->blocktranslation
    365370# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
    366371#
     
    388393         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    389394         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
    390 
    391 $encodings = {
    392     'iso_8859_1' => {'fullname' => 'Latin1 (western languages)',
    393              'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0},
    394 
    395     'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)',
    396              'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0},
    397 
    398     'iso_8859_3' => {'fullname' => 'Latin3',
    399              'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0},
    400 
    401     'iso_8859_4' => {'fullname' => 'Latin4',
    402              'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0},
    403 
    404     'iso_8859_5' => {'fullname' => 'Cyrillic',
    405              'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0},
    406 
    407     'iso_8859_6' => {'fullname' => 'Arabic',
    408              'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0},
    409 
    410     'iso_8859_7' => {'fullname' => 'Greek',
    411              'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0},
    412 
    413     'iso_8859_8' => {'fullname' => 'Hebrew',
    414              'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0},
    415 
    416     'iso_8859_9' => {'fullname' => 'Latin5',
    417              'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0},
    418 
    419     'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)',
    420                'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80},
    421 
    422     'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)',
    423                'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80},
    424 
    425     'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)',
    426                'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80},
    427 
    428     'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',
    429                'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80},
    430 
    431     'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)',
    432                'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80},
    433 
    434     'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',
    435                'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80},
    436 
    437     'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',
    438                'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80},
    439 
    440     'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)',
    441                'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80},
    442 
    443     'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)',
    444                'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80},
    445 
    446     'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)',
    447               'mapfile' => 'win874.ump', 'ascii_delim' => 0x80},
    448 
    449     'koi8_r' => {'fullname' => 'Cyrillic',
    450          'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80},
    451 
    452     'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)',
    453          'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80},
    454 
    455     'iscii_de' => {'fullname' => 'ISCII Devanagari',
    456            'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0}
    457 };
    458 
    459 # returns a pointer to unicode array
    460 sub simple2unicode {
    461     my ($encoding, $intext) = @_;
    462 
    463     if (!defined ($encodings->{$encoding})) {
    464     print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n";
    465     return [];
    466     }
    467 
    468     my $info = $encodings->{$encoding};
    469     my $encodename = "$encoding-unicode";
    470     my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc",
    471                       $info->{'mapfile'});
    472 
    473     if (!&loadmapencoding ($encodename, $mapfile)) {
    474     print STDERR "unicode: ERROR - could not load encoding $encodename\n";
    475     return [];
    476     }
    477    
    478     my @outtext = ();
    479     my $len = length($intext);
    480     my ($c);
    481     my $i = 0;
    482 
    483     while ($i < $len) {
    484     if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) {
    485         # normal ascii character
    486         push (@outtext, $c);
    487     } else {
    488         push (@outtext, &transchar ($encodename, $c));
    489     }
    490     $i ++;
    491     }
    492     return \@outtext;
    493 }
    494395
    495396# returns 1 if successful, 0 if unsuccessful
     
    503404    binmode (MAPFILE);
    504405
    505     $translations{$encoding} = [@array256];
     406    $translations{$encoding} = {'map' => [@array256], 'count' => 0};
    506407    my $block = $translations{$encoding};
    507408
     
    509410    while (read(MAPFILE, $in, 1) == 1) {
    510411    $i = unpack ("C", $in);
    511     $block->[$i] = [@array256];
     412    $block->{'map'}->[$i] = [@array256];
    512413    for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
    513414        my ($n1, $n2) = unpack ("CC", $in);
    514         $block->[$i]->[$j] = ($n1*256) + $n2;
    515     }
     415        $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
     416    }
     417    $block->{'count'} ++;
    516418    }
    517419
     
    519421}
    520422
    521 sub transchar {
    522     my ($encoding, $from) = @_;
    523     my $high = ($from / 256) % 256;
    524     my $low = $from % 256;
    525 
    526     return 0 unless defined $translations{$encoding};
    527 
    528     my $block = $translations{$encoding};
    529 
    530     if (ref ($block->[$high]) ne "ARRAY") {
    531     return 0;
    532     }
    533     return $block->[$high]->[$low];
    534 }
    535 
    536 
    537 
    538 
    5394231;
    540 
  • trunk/gsdl/src/recpt/converter.cpp

    r1285 r1870  
    2828
    2929
    30 // the converters within converterinfo become the property of
    31 // of this class after add_converter has been called. The converters
    32 // remain the responsability of the calling code and will not be
    33 // deleted by this class.
     30// the converters within converterinfo become the property of this class
     31// after add_converter has been called. The converters remain the
     32// responsibility of the calling code and will not be deleted by this
     33// class.
    3434void convertinfoclass::add_converter (const text_t &name, inconvertclass *inconverter,
    3535                      rzwsoutconvertclass *outconverter) {
  • trunk/gsdl/src/recpt/converter.h

    r1285 r1870  
    9898  size_type size() const {return converters.size();}
    9999
     100  const_iterator find(text_t &key) {converters.find(key);}
    100101
    101102  // added functionality
  • trunk/gsdl/src/recpt/librarymain.cpp

    r1860 r1870  
    3939#include "mgsearch.h"
    4040#include "mgppsearch.h"
    41 #include "fileutil.h"
    4241#include "collectset.h"
    4342#include <assert.h>
     
    6463#include "htmlbrowserclass.h"
    6564#include "phindbrowserclass.h"
    66 
    67 #include "recptconfig.h"
    6865
    6966int main () {
     
    176173  // add the protocol to the receptionist
    177174  recpt.add_protocol (&nproto);
    178 
    179   // z39.50 stuff - johnmcp
    180175
    181176  // z39.50 stuff - johnmcp
     
    200195#endif
    201196
    202   // Read main.cfg to get all the "Encoding" lines and add corresponding converters.
    203   // It might be possible to move this to somewhere like receptionist::configure, depending
    204   // on whether we need the converters before then (I don't think we do).
    205   text_tarray cfgline;
    206   text_t maincfg = filename_cat (gsdlhome, "etc", "main.cfg");
    207   if (file_exists (maincfg)) {
    208     char *maincfgc = maincfg.getcstr();
    209 #ifdef GSDL_USE_IOS_H
    210     ifstream confin (maincfgc, ios::in | ios::nocreate);
    211 #else
    212     ifstream confin (maincfgc, ios::in);
    213 #endif
    214     delete maincfgc;
    215  
    216     if (confin) {
    217       text_t subkey, subvalue, shortname;
    218       text_tset saved;
    219       text_tmap tmp;
    220       text_t::const_iterator cfglinesub_here;
    221       text_tarray::const_iterator cfgline_here;
    222       text_tarray::const_iterator cfgline_end;
    223       while (read_cfg_line(confin, cfgline) >= 0) {
    224     if (cfgline.size () >= 4 && cfgline[0] == "Encoding") {
    225       tmp.erase(tmp.begin(), tmp.end());
    226       cfgline_here = cfgline.begin();
    227       cfgline_end = cfgline.end();
    228       while (cfgline_here != cfgline_end) {
    229         cfglinesub_here = getdelimitstr((*cfgline_here).begin(),
    230                         (*cfgline_here).end(), '=', subkey);
    231         if (subkey == "shortname") {
    232           shortname = substr (cfglinesub_here, (*cfgline_here).end());
    233         } else {
    234           tmp[subkey] = substr (cfglinesub_here, (*cfgline_here).end());
    235         }
    236         cfgline_here++;
    237       }
    238       // we just use the saved set to prevent multiple encodings being added
    239       // that use the same shortname (i.e. any encodings after the first with
    240       // the same name will be ignored).
    241       if (!shortname.empty() && saved.find(shortname) == saved.end()) {
    242         saved.insert(shortname);
    243 
    244         if (tmp["type"] == "UTF8") {
    245           utf8inconvertclass *utf8inconvert = new utf8inconvertclass();
    246           utf8outconvertclass *utf8outconvert = new utf8outconvertclass();
    247           recpt.add_converter (shortname, utf8inconvert, utf8outconvert);
    248 
    249         } else if (tmp["type"] == "GB") {
    250           mapinconvertclass *gbinconvert = new mapinconvertclass();
    251           gbinconvert->setmapfile (gsdlhome, "gbku", 0x25a1);
    252           mapoutconvertclass *gboutconvert = new mapoutconvertclass();
    253           gboutconvert->setmapfile (gsdlhome, "ugbk", 0xa1f5);
    254           recpt.add_converter (shortname, gbinconvert, gboutconvert);
    255 
    256         } else {
    257           if (!tmp["mapfile"].empty()) {
    258 
    259         if (tmp["type"] == "ISO_8859" && tmp["mapfile"] == "1.TXT") {
    260           // iso-8859-1 is a special case as it'll always be
    261           // supported by the standard converter class and
    262           // therefore doesn't need to use its mapping file
    263           inconvertclass *inconvert = new inconvertclass();
    264           rzwsoutconvertclass *outconvert = new rzwsoutconvertclass();
    265           recpt.add_converter (shortname, inconvert, outconvert); 
    266 
    267         } else {
    268           text_t mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS", tmp["type"], tmp["mapfile"]);
    269           if (file_exists (mapfile)) {
    270             simplemapinconvertclass *inconvert = new simplemapinconvertclass();
    271             inconvert->setmapfile (mapfile);
    272             simplemapoutconvertclass *outconvert = new simplemapoutconvertclass();
    273             outconvert->setmapfile (mapfile);
    274             recpt.add_converter (shortname, inconvert, outconvert); 
    275           }
    276         }
    277           }
    278         }
    279       }
    280     }
    281       }
    282       confin.close ();
    283     }
    284   }
    285 
    286 
    287   // add other converters
    288   //  utf8inconvertclass utf8inconvert;
    289   //  utf8outconvertclass utf8outconvert;
    290   //  recpt.add_converter ("u", &utf8inconvert, &utf8outconvert);
    291 
    292   //  mapinconvertclass gbinconvert;
    293   //  gbinconvert.setmapfile (gsdlhome, "gbku", 0x25a1);
    294   //  mapoutconvertclass gboutconvert;
    295   //  gboutconvert.setmapfile (gsdlhome, "ugbk", 0xa1f5);
    296   //  recpt.add_converter ("g", &gbinconvert, &gboutconvert);
    297 
    298   // arabic
    299   //  text_t mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");
    300   //  mapfile = filename_cat (mapfile, "WINDOWS", "1256.TXT");
    301   //  simplemapinconvertclass arinconvert;
    302   //  arinconvert.setmapfile (mapfile);
    303   //  simplemapoutconvertclass aroutconvert;
    304   //  aroutconvert.setmapfile (mapfile);
    305   //  recpt.add_converter ("a", &arinconvert, &aroutconvert); 
    306 
    307   // cyrillic
    308   //  mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");
    309   //  mapfile = filename_cat (mapfile, "WINDOWS", "1251.TXT");
    310   //  simplemapinconvertclass cyinconvert;
    311   //  cyinconvert.setmapfile (mapfile);
    312   //  simplemapoutconvertclass cyoutconvert;
    313   //  cyoutconvert.setmapfile (mapfile);
    314   //  recpt.add_converter ("c", &cyinconvert, &cyoutconvert); 
    315  
    316   // hindi
    317 //    armapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");
    318 //    armapfile = filename_cat (armapfile, "ISCII", "Devanagari.txt");
    319 //    simplemapinconvertclass arinconvert;
    320 //    arinconvert.setmapfile (armapfile);
    321 //    simplemapoutconvertclass aroutconvert;
    322 //    aroutconvert.setmapfile (armapfile);
    323 //    recpt.add_converter ("a", &arinconvert, &aroutconvert); 
    324 
    325 
    326197  // the list of actions. Note: these actions will become invalid
    327198  // at the end of this function.
  • trunk/gsdl/src/recpt/pageaction.cpp

    r1861 r1870  
    353353      }
    354354
    355       if (pref_langs.find("zh") == pref_langs.end())
    356     disp.setmacro ("encodingoption", "preferences", "");
    357 
    358355    } else {
    359356      while (tlang != elang) {
     
    373370    // create the "encoding" selection box for the preferences page
    374371    text_t &arg_w = args["w"];
    375     // put encodings in another map to sort them by longname
    376     text_tmap encodings;
    377     encodinginfo_tmap::const_iterator thisenc = configinfo.encodings.begin();
    378     encodinginfo_tmap::const_iterator endenc = configinfo.encodings.end();
     372    text_t encodingoption;
     373    text_tmap::const_iterator thisenc = configinfo.encodings.begin();
     374    text_tmap::const_iterator endenc = configinfo.encodings.end();
    379375    while (thisenc != endenc) {
    380       encodings[(*thisenc).second.longname] = (*thisenc).first;
    381       thisenc++;
    382     }
    383     text_tmap::iterator tenc = encodings.begin();
    384     text_tmap::iterator eenc = encodings.end();
    385 
    386     text_t encodingoption;
    387     while (tenc != eenc) {
    388       encodingoption += "<option value=\"" + (*tenc).second + "\"";
    389       if ((*tenc).second == arg_w) encodingoption += " selected";
    390       encodingoption += ">" + (*tenc).first + "\n";
    391       tenc ++;
     376      encodingoption += "<option value=\"" + (*thisenc).second + "\"";
     377      if ((*thisenc).second == arg_w) encodingoption += " selected";
     378      encodingoption += ">" + (*thisenc).first + "\n";
     379      thisenc ++;
    392380    }
    393381
  • trunk/gsdl/src/recpt/receptionist.cpp

    r1861 r1870  
    9494}
    9595
    96 void encodinginfo_t::clear () {
    97   longname.clear();
    98   label.clear();
    99 }
    100 
    10196receptionist::receptionist () {
    10297  // create a list of cgi arguments
     
    207202void receptionist::configure (const text_t &key, const text_tarray &cfgline) {
    208203  // configure the receptionist
     204
    209205  if (cfgline.size() >= 1) {
    210206    cgiarginfo *info = NULL;
     
    317313
    318314    } else if (key == "Encoding") {
    319       text_t subkey, subvalue;
    320       text_t shortname, longname, label;
     315
     316      configure_encoding (cfgline);
     317
     318    } else if (key == "Language") {
     319      text_t subkey, subvalue, shortname;
     320      languageinfo_t lang;
    321321      text_t::const_iterator cfglinesub_here;
    322322      text_tarray::const_iterator cfgline_here = cfgline.begin();
     
    328328      shortname = substr (cfglinesub_here, (*cfgline_here).end());
    329329    } else if (subkey == "longname") {
    330       longname = substr (cfglinesub_here, (*cfgline_here).end());
    331     } else if (subkey == "label") {
    332       label = substr (cfglinesub_here, (*cfgline_here).end());
    333     }
    334     cfgline_here++;
    335       }
    336       if (!shortname.empty() && !label.empty()) {
    337     encodinginfo_t enc;
    338     if (longname.empty()) enc.longname = shortname;
    339     else enc.longname = longname;
    340     enc.label = label;
    341     configinfo.encodings[shortname] = enc;
    342       }
    343 
    344     } else if (key == "Language") {
    345       text_t subkey, subvalue;
    346       text_t shortname, longname, defaultencoding;
    347       text_t::const_iterator cfglinesub_here;
    348       text_tarray::const_iterator cfgline_here = cfgline.begin();
    349       text_tarray::const_iterator cfgline_end = cfgline.end();
    350       while (cfgline_here != cfgline_end) {
    351     cfglinesub_here = getdelimitstr((*cfgline_here).begin(),
    352                     (*cfgline_here).end(), '=', subkey);
    353     if (subkey == "shortname") {
    354       shortname = substr (cfglinesub_here, (*cfgline_here).end());
    355     } else if (subkey == "longname") {
    356       longname = substr (cfglinesub_here, (*cfgline_here).end());
     330      lang.longname = substr (cfglinesub_here, (*cfgline_here).end());
    357331    } else if (subkey == "default_encoding") {
    358       defaultencoding = substr (cfglinesub_here, (*cfgline_here).end());
     332      lang.defaultencoding = substr (cfglinesub_here, (*cfgline_here).end());
    359333    }
    360334    cfgline_here++;
    361335      }
    362336      if (!shortname.empty()) {
    363     languageinfo_t lang;
    364     if (longname.empty()) lang.longname = shortname;
    365     else lang.longname = longname;
    366     lang.defaultencoding = defaultencoding;
     337    if (lang.longname.empty()) lang.longname = shortname;
    367338    configinfo.languages[shortname] = lang;
    368339      }
     
    415386
    416387
    417 // init should be called after all the actions, protocols, and
    418 // converters have been added to the receptionist and after everything
    419 // has been configured but before any pages are created.
    420 // It returns true on success and false on failure. If false is
    421 // returned getpage should not be called (without producing
    422 // meaningless output), instead an error page should be
    423 // produced by the calling code.
     388// init should be called after all the actions and protocols have been
     389// added to the receptionist and after everything has been configured but
     390// before any pages are created.  It returns true on success and false on
     391// failure. If false is returned getpage should not be called (without
     392// producing meaningless output), instead an error page should be produced
     393// by the calling code.
    424394bool receptionist::init (ostream &logout) {
     395
    425396  // first configure collectdir
    426397  text_t thecollectdir = configinfo.gsdlhome;
     
    470441  srand (time(NULL));
    471442
    472   // make the output converters remove all the zero-width spaces
    473   convertinfoclass::iterator converthere = converters.begin ();
    474   convertinfoclass::iterator convertend = converters.end ();
    475   while (converthere != convertend) {
    476     assert ((*converthere).second.outconverter != NULL);
    477     if ((*converthere).second.outconverter != NULL) {
    478       (*converthere).second.outconverter->set_rzws(1);
    479     }
    480     converthere++;
    481   }
    482 
    483443  // if maintainer email address is something dodgy (for now I'll define
    484444  // dodgy as being anything that doesn't contain '@') disable EmailEvents
     
    539499
    540500  // make sure the encoding is valid
    541   if (configinfo.encodings.find(default_encoding) == configinfo.encodings.end()) return "";
     501  if (converters.find(default_encoding) == converters.end()) return "";
    542502
    543503  return default_encoding;
     
    828788  // add the encoding information
    829789  if (response == content) {
    830     if (configinfo.encodings.find(args["w"]) != configinfo.encodings.end()) {
    831       response_data += "; charset=" + configinfo.encodings[args["w"]].label;
     790    if (converters.find(args["w"]) != converters.end()) {
     791      response_data += "; charset=" + args["w"];
    832792    } else {
    833793      // default to latin 1
     
    13351295}
    13361296
    1337 
     1297// Handles an "Encoding" line from a configuration file - note that the
     1298// configinfo.encodings map is a bit of a hack (to be fixed when the
     1299// configuration files are tidied up).
     1300void receptionist::configure_encoding (const text_tarray &cfgline) {
     1301
     1302  text_t subkey, subvalue, shortname, longname, mapfile;
     1303  text_t::const_iterator cfglinesub_here;
     1304  text_tarray::const_iterator cfgline_here = cfgline.begin();
     1305  text_tarray::const_iterator cfgline_end = cfgline.end();
     1306  while (cfgline_here != cfgline_end) {
     1307    cfglinesub_here = getdelimitstr((*cfgline_here).begin(),
     1308                    (*cfgline_here).end(), '=', subkey);
     1309    if (subkey == "shortname") {
     1310      shortname = substr (cfglinesub_here, (*cfgline_here).end());
     1311    } else if (subkey == "longname") {
     1312      longname = substr (cfglinesub_here, (*cfgline_here).end());
     1313    } else if (subkey == "map") {
     1314      mapfile = substr (cfglinesub_here, (*cfgline_here).end());
     1315    }
     1316    cfgline_here++;
     1317  }
     1318  if (!shortname.empty()) {
     1319    if (longname.empty()) longname = shortname;
     1320
     1321    // add the converter
     1322    if (shortname == "utf-8") {
     1323      utf8inconvertclass *utf8inconvert = new utf8inconvertclass();
     1324      utf8outconvertclass *utf8outconvert = new utf8outconvertclass();
     1325      utf8outconvert->set_rzws(1);
     1326      add_converter (shortname, utf8inconvert, utf8outconvert);
     1327      configinfo.encodings[longname] = shortname;
     1328
     1329    } else if (!mapfile.empty()) {
     1330
     1331      if (mapfile == "8859_1.ump") {
     1332    // iso-8859-1 is a special case as it'll always be supported by the
     1333    // standard converter class and therefore doesn't need to use its
     1334    // mapping file
     1335    inconvertclass *inconvert = new inconvertclass();
     1336    rzwsoutconvertclass *outconvert = new rzwsoutconvertclass();
     1337    outconvert->set_rzws(1);
     1338    add_converter (shortname, inconvert, outconvert); 
     1339    configinfo.encodings[longname] = shortname;
     1340
     1341      } else {
     1342    text_t to_uc_map = filename_cat(configinfo.gsdlhome, "mappings", "to_uc", mapfile);
     1343    text_t from_uc_map = filename_cat(configinfo.gsdlhome, "mappings", "from_uc", mapfile);
     1344    if (file_exists(to_uc_map) && file_exists(from_uc_map)) {
     1345
     1346      mapinconvertclass *mapinconvert = new mapinconvertclass();
     1347      mapinconvert->setmapfile (to_uc_map, 0x003F);
     1348      mapoutconvertclass *mapoutconvert = new mapoutconvertclass();
     1349      mapoutconvert->setmapfile (from_uc_map, 0x3F);
     1350      mapoutconvert->set_rzws(1);
     1351      add_converter (shortname, mapinconvert, mapoutconvert);
     1352      configinfo.encodings[longname] = shortname;
     1353    }
     1354      }
     1355    }
     1356  }
     1357}
  • trunk/gsdl/src/recpt/receptionist.h

    r1860 r1870  
    6969};
    7070
    71 struct encodinginfo_t {
    72   void clear();
    73   encodinginfo_t () {clear();}
    74  
    75   text_t longname;
    76   text_t label;
    77 };
    78 
    7971typedef map<text_t, collectioninfo_t, lttext_t> colinfo_tmap;
    8072typedef map<text_t, languageinfo_t, lttext_t> languageinfo_tmap;
    81 typedef map<text_t, encodinginfo_t, lttext_t> encodinginfo_tmap;
    8273
    8374enum events_t {Disabled, CollectorEvents, AllEvents};
     
    111102 
    112103  languageinfo_tmap languages;
    113   encodinginfo_tmap encodings;
     104
     105  // encodings is just a simple mapping from encoding longnames to
     106  // shortnames.  It's useful for now for creating the pulldown menu of
     107  // encodings on the preferences page but isn't intended to be permanent.
     108  text_tmap encodings;
    114109
    115110  void clear ();
     
    150145  bool append_logstr (const text_t &filename, const text_t &logstr,
    151146              ostream &logout);
     147
     148  void configure_encoding (const text_tarray &cfgline);
    152149
    153150public:
Note: See TracChangeset for help on using the changeset viewer.