Changeset 1870

trunk/gsdl/bin/script/makemapfile.pl

-              r1868
+              r1870
 ###########################################################################
-# Creates a binary map file for use by complex character encodings
-# (e.g. CJK encodings like GBK and Shift-JIS). The map file is written to
-# the $GSDLHOME/unicode directory.
 BEGIN {
     die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
 …
 use parsargv;
+use cjk;
+use util;
+# %translations is of the form:
+#
+# translations{encodingname-encodingname}->blocktranslation
+# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
+#
+# Any of the top translation blocks can point to an undefined
+# value. This data structure aims to allow fast translation and
+# efficient storage.
+%translations = ();
+# @array256 is the zero-filled, 256-slot template from which translation
+# blocks are copied (see the [@array256] copies in loadencoding and
+# addchartrans). The repetition operator guarantees exactly 256 entries.
+@array256 = (0) x 256;
 &main();
 …
+    }
+    &cjk::makeencodingmapfile ($encoding, $mapfile);
+    if (!&loadencoding ($encoding, $mapfile)) {
+    die "couldn't load encoding $encoding";
+    }
+    # write out map files
+    &writemapfile ("$encoding-unicode", $encoding, 1);
+    &writemapfile ("unicode-$encoding", $encoding, 0);
+}
+# writemapfile dumps one translation table from %translations to a binary
+# .ump (unicode map) file.
+#
+# $encoding  - key into %translations (e.g. "gbk-unicode" or "unicode-gbk")
+# $filename  - base name of the output file; ".ump" is appended
+# $tounicode - true to write into $GSDLHOME/mappings/to_uc, false to write
+#              into $GSDLHOME/mappings/from_uc
+#
+# File layout: for every high byte that has a sub-block, one byte holding
+# that high byte followed by 256 two-byte values in network (big-endian)
+# order, one per low byte.
+#
+# Dies if the translation has not been loaded or the file can't be written.
+sub writemapfile {
+    my ($encoding, $filename, $tounicode) = @_;
+
+    $filename .= ".ump"; # unicode map file
+    my $direction = $tounicode ? "to_uc" : "from_uc";
+    $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", $direction, $filename);
+
+    die "translation not defined" if (!defined $translations{$encoding});
+    my $block = $translations{$encoding};
+
+    print "writing $filename\n";
+
+    # three-argument open keeps the mode separate from the path, and the
+    # error message says which file failed and why
+    open (MAPFILE, ">", $filename) || die "couldn't open $filename for writing: $!\n";
+    binmode (MAPFILE);
+
+    my ($i, $j);
+    for ($i=0; $i<256; $i++) {
+        # only high bytes that actually have a sub-block are written
+        next unless ref ($block->[$i]) eq "ARRAY";
+
+        print MAPFILE pack ("C", $i);
+        for ($j=0; $j<256; $j++) {
+            # unsigned short in network order
+            print MAPFILE pack ("n", $block->[$i]->[$j]);
+        }
+    }
+
+    # buffered write errors only show up when the handle is closed
+    close (MAPFILE) || die "error while writing $filename: $!\n";
+}
+# loadencoding reads a plain-text map file into %translations, filling in
+# both the "$encoding-unicode" and the "unicode-$encoding" tables.
+#
+# The mapfile is expected to contain (at least) two tab-separated fields
+# per line, each starting with "0x": the first field is the mapped value
+# and the second field is the unicode value. Anything after a '#' is
+# treated as a comment, and lines failing the sanity checks are skipped.
+#
+# It returns 1 if successful (or if the encoding was already loaded) and
+# 0 if the mapfile could not be opened.
+sub loadencoding {
+    my ($encoding, $mapfile) = @_;
+
+    my $to = "$encoding-unicode";
+    my $from = "unicode-$encoding";
+
+    # check to see if the encoding has already been loaded
+    if (defined $translations{$to} && defined $translations{$from}) {
+        return 1;
+    }
+
+    # open explicitly for reading: with a 2-argument open a mapfile name
+    # such as ">file" or "command |" would be taken as a mode or a pipe
+    return 0 unless open (MAPFILE, "<", $mapfile);
+
+    $translations{$to} = [@array256];
+    $translations{$from} = [@array256];
+
+    my $line;
+    while (defined ($line = <MAPFILE>)) {
+        chomp $line;
+
+        # remove comments
+        $line =~ s/\#.*$//;
+        next unless $line =~ /\S/;
+
+        # split the line into fields and do a few
+        # simple sanity checks
+        my @fields = split (/\t/, $line);
+        next unless (scalar(@fields) >= 2 &&
+                     $fields[0] =~ /^0x/ &&
+                     $fields[1] =~ /^0x/);
+
+        my $char = hex($fields[0]);
+        my $unic = hex($fields[1]);
+
+        # might need this for some versions of gb but not gbk
+#       $char = $char | 0x8080 unless ($encoding =~ /gbk/i);
+
+        # record the mapping in both directions
+        &addchartrans ($translations{$to}, $char, $unic);
+        &addchartrans ($translations{$from}, $unic, $char);
+    }
+
+    close (MAPFILE);
+    return 1;
+}
+# addchartrans adds one character translation to a translation block.
+#
+# $block - reference to the top-level array of a translation table
+# $from  - value to translate from; its high byte selects the sub-block
+#          and its low byte selects the slot inside that sub-block
+# $to    - value stored in that slot
+#
+# The sub-block for a high byte is created (as a copy of @array256) the
+# first time a character with that high byte is added.
+sub addchartrans {
+    my ($block, $from, $to) = @_;
+
+    my $high = int ($from / 256) % 256;
+    my $low = $from % 256;
+
+    if (ref ($block->[$high]) ne "ARRAY") {
+        $block->[$high] = [@array256];
+    }
+    $block->[$high]->[$low] = $to;
+}

trunk/gsdl/etc/main.cfg

-              r1868
+              r1870
 # Define the interface languages and encodings supported by this receptionist
+# An "Encoding" line defines an encoding to be used by the receptionist
+# options are:
+# shortname -- Identifier for the given encoding. The shortname option is
+#              mandatory and must be unique for each "Encoding" line.
+# An "Encoding" line defines an encoding to be used by the receptionist.
+# Uncomment "Encoding" lines to include an encoding on your "preferences" page.
+# Encoding line options are:
+# shortname -- The standard charset label for the given encoding. The
+#              shortname option is mandatory.
 # longname  -- The display name of the given encoding. If longname isn't set
 #              it will default to using shortname instead.
+# type      -- The type of encoding. Note that for most encodings this
+#              value is the directory name under which the map file for
+#              this encoding resides in the Greenstone unicode/MAPPINGS
+#              directory (e.g. 'WINDOWS', 'ISO_8859' etc.). It may also
+#              take the values 'CJK' and 'UTF8'.
+# mapfile   -- The name of the map file for use when converting between
+#              utf8 and the given encoding. The mapfile option is mandatory
+#              for all encoding types with the exception of UTF8. If type
+#              is CJK, mapfile is the abbreviated name of the encoding as
+#              used by the binary mapping files (.ump files). i.e. if the
+#              encoding uses the map files gbku.ump and ugbk.ump, mapfile
+#              will be set to "gbk".
+# label     -- The standard label to which you must set the value of
+#              "charset" within http headers or html meta tags to get a web
+#              browser to use the given encoding. The label option is
+#              mandatory.
+Encoding shortname=utf8 "longname=Unicode (UTF-8)" type=UTF8 label=UTF-8
+Encoding shortname=iso1 "longname=Western (ISO-8859-1)" type=ISO_8859 mapfile=1.TXT label=ISO-8859-1
+Encoding shortname=w1251 "longname=Cyrillic (Windows-1251)" type=WINDOWS mapfile=1251.TXT label=windows-1251
+Encoding shortname=w1256 "longname=Arabic (Windows-1256)" type=WINDOWS mapfile=1256.TXT label=windows-1256
+Encoding shortname=w1250 "longname=Central European (Windows-1250)" type=WINDOWS mapfile=1250.TXT label=windows-1250
+Encoding shortname=gb "longname=Chinese Simplified (GBK)" type=CJK label=GBK mapfile=gbk
+Encoding shortname=sjis "longname=Japanese (Shift-JIS)" type=CJK label=shift_jis mapfile=sjis
+Encoding shortname=koi8r "longname=Cyrillic (KOI8-R)" type=CYRILLIC mapfile=koi8_r.txt label=koi8-r
+# map       -- The name of the map file (i.e. the .ump file) for use when
+#              converting between unicode and the given encoding. The map
+#              option is mandatory for all encoding lines except the
+#              special case for utf8.
+# The following encoding is not currently supported
+# Encoding shortname=eucjp "longname=Japanese (EUC)" type=CJK label=euc-jp mapfile=jis
+# The utf8 encoding is handled internally and doesn't require a map file.
+# As a rule the utf8 encoding should always be enabled, especially if you
+# have collections of documents that may not all be in the same
+# language/encoding.
+Encoding shortname=utf-8 "longname=Unicode (UTF-8)"
+# The ISO-8859 series
+Encoding shortname=iso-8859-1 "longname=Western (ISO-8859-1)" map=8859_1.ump
+#Encoding shortname=iso-8859-2 "longname=Central European (ISO-8859-2)" map=8859_2.ump
+#Encoding shortname=iso-8859-3 "longname=Latin 3 (ISO-8859-3)" map=8859_3.ump
+#Encoding shortname=iso-8859-4 "longname=Latin 4 (ISO-8859-4)" map=8859_4.ump
+#Encoding shortname=iso-8859-5 "longname=Cyrillic (ISO-8859-5)" map=8859_5.ump
+#Encoding shortname=iso-8859-6 "longname=Arabic (ISO-8859-6)" map=8859_6.ump
+#Encoding shortname=iso-8859-7 "longname=Greek (ISO-8859-7)" map=8859_7.ump
+#Encoding shortname=iso-8859-8 "longname=Hebrew (ISO-8859-8)" map=8859_8.ump
+#Encoding shortname=iso-8859-9 "longname=Turkish (ISO-8859-9)" map=8859_9.ump
+# Windows codepages
+#Encoding shortname=windows-1250 "longname=Central European (Windows-1250)" map=win1250.ump
+#Encoding shortname=windows-1251 "longname=Cyrillic (Windows-1251)" map=win1251.ump
+#Encoding shortname=windows-1252 "longname=Western (Windows-1252)" map=win1252.ump
+#Encoding shortname=windows-1253 "longname=Greek (Windows-1253)" map=win1253.ump
+#Encoding shortname=windows-1254 "longname=Turkish (Windows-1254)" map=win1254.ump
+#Encoding shortname=windows-1255 "longname=Hebrew (Windows-1255)" map=win1255.ump
+Encoding shortname=windows-1256 "longname=Arabic (Windows-1256)" map=win1256.ump
+#Encoding shortname=windows-1257 "longname=Baltic (Windows-1257)" map=win1257.ump
+#Encoding shortname=windows-1258 "longname=Vietnamese (Windows-1258)" map=win1258.ump
+#Encoding shortname=windows-874 "longname=Thai (Windows-874)" map=win874.ump
+# KOI8 Cyrillic encodings
+#Encoding shortname=koi8-r "longname=Cyrillic (KOI8-R)" map=koi8_r.ump
+#Encoding shortname=koi8-u "longname=Cyrillic (KOI8-U)" map=koi8_u.ump
+# CJK encodings (note that Shift-JIS Japanese isn't currently supported)
+Encoding shortname=gbk "longname=Chinese Simplified (GBK)" map=gbk.ump
+Encoding shortname=big5 "longname=Chinese Traditional (Big5)" map=big5.ump
+Encoding shortname=euc-jp "longname=Japanese (EUC)" map=euc_jp.ump
+Encoding shortname=euc-kr "longname=Korean (UHC)" map=uhc.ump
 …
 #                     interface language. This should be set to the
 #                     "shortname" of a valid "Encoding" line
 Language shortname=en longname=English default_encoding=iso1
 Language shortname=fr longname=French default_encoding=iso1
 Language shortname=zh longname=Chinese default_encoding=gb
 Language shortname=de longname=German default_encoding=iso1
 Language shortname=es longname=Spanish default_encoding=iso1
 Language shortname=mi longname=Maori default_encoding=iso1
 Language shortname=ar longname=Arabic default_encoding=w1256
 Language shortname=pt longname=Portuguese default_encoding=iso1
 Language shortname=nl longname=Dutch default_encoding=iso1
+Language shortname=en longname=English default_encoding=iso-8859-1
+Language shortname=fr longname=French default_encoding=iso-8859-1
+Language shortname=zh longname=Chinese default_encoding=gbk
+Language shortname=de longname=German default_encoding=iso-8859-1
+Language shortname=es longname=Spanish default_encoding=iso-8859-1
+Language shortname=mi longname=Maori default_encoding=iso-8859-1
+Language shortname=ar longname=Arabic default_encoding=windows-1256
+Language shortname=pt longname=Portuguese default_encoding=iso-8859-1
+Language shortname=nl longname=Dutch default_encoding=iso-8859-1

trunk/gsdl/lib/gsdlunicode.cpp

-              r1310
+              r1870
 // setmapfile will cause loadmapfile to be called when conversion is
 // needed
+bool mapconvert::setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
+                 unsigned short theabsentc) {
+bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
   // check to see if the mapfile has been already loaded
+  if (mapdata.loaded && gsdlhome == thegsdlhome &&
+      encoding == theencoding && absentc == theabsentc)
+    return true;
+  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
   unloadmapfile ();
+  gsdlhome = thegsdlhome;
+  encoding = theencoding;
+  mapfile = themapfile;
   absentc = theabsentc;
 …
 // loadmapfile should be called before any conversion is done
+bool mapconvert::loadmapfile (const text_t &thegsdlhome,
+                  const text_t &theencoding,
+bool mapconvert::loadmapfile (const text_t &themapfile,
                   unsigned short theabsentc) {
   FILE *mapfilein = (FILE *)NULL;
   // check to see if the mapfile has been already loaded
+  if (mapdata.loaded && gsdlhome == thegsdlhome &&
+      encoding == theencoding && absentc == theabsentc)
+    return true;
+  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
   unloadmapfile ();
+  gsdlhome = thegsdlhome;
+  encoding = theencoding;
+  mapfile = themapfile;
   absentc = theabsentc;
   // open the map file
+  text_t filename = filename_cat (gsdlhome, "unicode");
+  filename = filename_cat (filename, encoding);
+  filename += ".ump";
+  char *cfilename = filename.getcstr();
+  char *cfilename = mapfile.getcstr();
   if (cfilename == (char *)NULL) return false;
   mapfilein = fopen(cfilename, "rb");
 …
 unsigned short mapconvert::convert (unsigned short c) {
   if (!mapdata.loaded) {
+    if (!gsdlhome.empty() && !encoding.empty() &&
+    loadmapfile (gsdlhome, encoding, absentc)) {
+    if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
       // do nothing, successfully loaded database
     } else return absentc;

trunk/gsdl/lib/gsdlunicode.h

-              r1310
+              r1870
   // setmapfile will cause loadmapfile to be called when conversion is
   // needed
+  bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
+           unsigned short theabsentc);
+  bool setmapfile (const text_t &themapfile, unsigned short theabsentc);
   // loadmapfile should be called before any conversion is done
+  bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
+            unsigned short theabsentc);
+  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
   void unloadmapfile ();
 …
 protected:
+  text_t gsdlhome;
+  text_t encoding;
+  text_t mapfile;
   unsigned short absentc;
   mapdata_t mapdata;
 …
   // setmapfile will cause loadmapfile to be called when conversion is needed
+  bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
+            unsigned short theabsentc) {
+    return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
+  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
+    return converter.setmapfile (themapfile, theabsentc);
   };
   // loadmapfile should be called before any conversion takes
   // place
+  bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
+            unsigned short theabsentc) {
+    return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
+  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
+    return converter.loadmapfile (themapfile, theabsentc);
   };
 …
   // setmapfile will cause loadmapfile to be called when conversion is needed
+  bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
+            unsigned short theabsentc) {
+    return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
+  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
+    return converter.setmapfile (themapfile, theabsentc);
   };
   // loadmapfile should be called before any conversion takes
   // place
+  bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
+            unsigned short theabsentc) {
+    return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
+  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
+    return converter.loadmapfile (themapfile, theabsentc);
   };

trunk/gsdl/mappings/README

-              r1868
+              r1870
+This directory contains mapping files for converting various character
+encodings to and from unicode.
+This directory and its subdirectories contain .ump mapping files for
+converting various character encodings to and from unicode.
+To generate .ump files use a command like "makemapfile.pl -encoding
+encodingname -mapfile textmapfile" where encodingname becomes the filename
+of the two new .ump files and textmapfile is a plain text file containing a
+tab separated list of the form:
+0x8167        0x201C
+where the first column is the hexadecimal value of the encoded character
+and the second is the hexadecimal value of its unicode equivalent.
+The following .ump files were generated from their corresponding Microsoft
+codepages. These codepages do, in some cases, differ very slightly from the
+standards they were based on but we've used them anyway as they're so
+extensively used on the web.
+* gbk.ump: Simplified Chinese - generated from Microsoft's codepage 936
+* shiftjis.ump: Japanese - generated from Microsoft's codepage 932
+* uhc.ump: UHC Korean - generated from Microsoft's codepage 949
+* big5.ump: Traditional Chinese - generated from Microsoft's codepage 950

trunk/gsdl/perllib/doc.pm

-              r1868
+              r1870
     $self->set_utf8_metadata_element ($section, $field,
                       &unicode::ascii2utf8($value));
+                      &unicode::ascii2utf8(\$value));
+}
 …
     $self->add_utf8_metadata ($section, $field,
                   &unicode::ascii2utf8($value));
+                  &unicode::ascii2utf8(\$value));
+}
 …
     # convert the text to UTF-8 encoded unicode characters
     # and add the text
     $self->add_utf8_text($section, &unicode::ascii2utf8($text));
+    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
+}

trunk/gsdl/perllib/multiread.pm

-              r1868
+              r1870
 # utf8             - either utf8 or unicode (automatically detected)
 # unicode          - just unicode (doesn't currently do endian detection)
+# gb               - GB
+# iso_8859_[1-9]   - 8 bit extended ascii encodings
+# windows_125[0-8] - Windows codepages 1250 to 1258
+# windows_874      - Windows codepage 874
+# iscii_de         - ISCII Devanagari
+# shift_jis        - Shift-JIS
+# euc_jp           - EUC encoded Japanese
+# uhc              - Unified Hangul Code (Korean)
+#
+# plus all encodings in the "encodings" package
 package multiread;
 use unicode;
-use cjk;
 sub new {
 …
+    }
+    if ($self->{'encoding'} eq "gb") {
+    # GB or GBK
+    if ($self->{'encoding'} eq "iso_8859_1") {
+    # we'll use ascii2utf8() for this as it's faster than going
+    # through convert2unicode()
     my $line = "";
     if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8 (&cjk::gb2unicode ($line));
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} eq "iso_8859_1") {
+    # special case for iso_8859_1 as &ascii2utf8($line) is faster than
+    # &unicode2utf8(iso2unicode('1', $line))
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::ascii2utf8 ($line);
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::iso2unicode ($1, $line));
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} =~ /windows_(\d{3,4})$/) {
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::windows2unicode ($1, $line));
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line));
+    }
+    return undef;
+    }
+    if ($self->{'encoding'} eq "iscii_de") {
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::iscii2unicode ("Devanagari", $line));
+    }
+    return undef;
+    }
+    # unknown encoding
+        return &unicode::ascii2utf8 (\$line);
+    }
+    }
+    # everything else uses unicode::convert2unicode
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+    return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
+    }
     return undef;
+}
 …
+    }
+    if ($self->{'encoding'} eq "gb") {
+    if ($self->{'encoding'} eq "iso_8859_1") {
+    # we'll use ascii2utf8() for this as it's faster than going
+    # through convert2unicode()
     undef $/;
     my $text = <$handle>;
     $/ = "\n";
     $$outputref .= &unicode::unicode2utf8 (&cjk::gb2unicode ($text));
+    $$outputref .= &unicode::ascii2utf8 (\$text);
     return;
+    }
+    if ($self->{'encoding'} eq "iso_8859_1") {
+    # special case for iso_8859_1 as &ascii2utf8($text) is faster than
+    # &unicode2utf8(iso2unicode('1', $text))
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::ascii2utf8 ($text);
+    return;
+    }
+    if ($self->{'encoding'} eq "shift_jis") {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&cjk::sjis2unicode ($text));
+    return;
+    }
+    if ($self->{'encoding'} eq "euc_jp") {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&cjk::eucjp2unicode ($text));
+    return;
+    }
+    if ($self->{'encoding'} eq "euc_kr") {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&cjk::euckr2unicode ($text));
+    return;
+    }
+    if ($self->{'encoding'} eq "uhc") {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&cjk::uhc2unicode ($text));
+    return;
+    }
+    # if we get to here we assume it's a simple 8 bit encoding
+    # everything else uses unicode::convert2unicode
     undef $/;
     my $text = <$handle>;
     $/ = "\n";
     $$outputref .= &unicode::unicode2utf8(&unicode::simple2unicode ($self->{'encoding'}, $text));
+    $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
+}

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r1868
+              r1870
 use parsargv;
 use multiread;
+use encodings;
 use cnseg;
 use acronym;
 …
 use diagnostics;
 use DateExtract;
-use iso639;
-# Encoding names recognised by the plugin (every value is the empty string). If textcat returns an
-# encoding that isn't in this list we'll print a warning and use the default encoding instead
-%supported_encodings = (
-            "ascii" => "",
-            "utf8" => "",
-            "iso_8859_1" => "",
-            "windows_1252" => "",
-            "iso_8859_2" => "",
-            "windows_1250" => "",
-            "iso_8859_3" => "",
-            "iso_8859_4" => "",
-            "iso_8859_5" => "",
-            "windows_1251" => "",
-            "koi8_r" => "",
-            "koi8_u" => "",
-            "iso_8859_6" => "",
-            "windows_1256" => "",
-            "iso_8859_7" => "",
-            "windows_1253" => "",
-            "iso_8859_8" => "",
-            "windows_1255" => "",
-            "iso_8859_9" => "",
-            "windows_1254" => "",
-            "gb" => "",
-            "iscii_de" => "",
-            "windows_1257" => "",
-            "windows_874" => "",
-            "windows_1258" => "",
-            "shift_jis" => "",
-            "euc_jp" => "",
-            "uhc" => ""
-            );
 sub print_general_usage {
 …
     print STDERR "                       unicode: just unicode\n";
+    print STDERR "                       iso_8859_1: Latin1 (western european languages)\n";
+    print STDERR "                       windows_1252: Windows codepage 1252 (WinLatin1)\n";
+    print STDERR "                       iso_8859_2: Latin2 (central and eastern european languages)\n";
+    print STDERR "                       windows_1250: Windows codepage 1250 (WinLatin2)\n";
+    print STDERR "                       iso_8859_3: Latin3\n";
+    print STDERR "                       iso_8859_4: Latin4\n";
+    print STDERR "                       iso_8859_5: Cyrillic\n";
+    print STDERR "                       windows_1251: Windows codepage 1251 (WinCyrillic)\n";
+    print STDERR "                       koi8_r: Cyrillic - Russian\n";
+    print STDERR "                       koi8_u: Cyrillic - Ukrainian\n";
+    print STDERR "                       iso_8859_6: Arabic\n";
+    print STDERR "                       windows_1256: Windows codepage 1256 (WinArabic)\n";
+    print STDERR "                       iso_8859_7: Greek\n";
+    print STDERR "                       windows_1253: Windows codepage 1253 (WinGreek)\n";
+    print STDERR "                       iso_8859_8: Hebrew\n";
+    print STDERR "                       windows_1255: Windows codepage 1255 (WinHebrew)\n";
+    print STDERR "                       iso_8859_9: Latin5\n";
+    print STDERR "                       windows_1254: Windows codepage 1254 (WinTurkish)\n";
+    print STDERR "                       gb: GB or GBK simplified Chinese\n";
+    print STDERR "                       iscii_de: ISCII Devanagari\n";
+    print STDERR "                       windows_1257: Windows codepage 1257 (WinBaltic)\n";
+    print STDERR "                       windows_874: Windows codepage 874 (Thai)\n";
+    print STDERR "                       windows_1258: Windows codepage 1258 (Vietnamese)\n";
+    print STDERR "                       shift_jis: Shift-JIS (Japanese)\n";
+    print STDERR "                       euc_jp: EUC encoded Japanese\n";
+    print STDERR "                       uhc: Unified Hangul Code (Korean). This is a superset of\n";
+    print STDERR "                            EUC encoded Korean\n\n";
+    my $e = $encodings::encodings;
+    foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
+    print STDERR "                       $enc: $e->{$enc}->{'name'}\n";
+    }
     print STDERR "   -default_encoding If -input_encoding is set to 'auto' and the text categorization\n";
 …
     my $enc = "^(";
     map {$enc .= "|$_";} keys %supported_encodings;
     my $denc = $enc . "|unicode)\$";
     $enc .= "|unicode|auto)\$";
+    map {$enc .= "|$_";} keys %$encodings::encodings;
+    my $denc = $enc . "ascii|utf8|unicode)\$";
+    $enc .= "ascii|utf8|unicode|auto)\$";
     $self->{'outhandle'} = STDERR;
 …
     my @results = textcat::classify($text);
-#    foreach $i (@results) {
-#   print STDERR "i: $i\n";
-#    }
     if (scalar @results != 1) {
 …
     # format language/encoding
     my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
+    die "Invalid language\n" if !defined $language;
+    if (!defined $language) {
+    if ($self->{'verbosity'}) {
+        print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
+        print $outhandle "defaulting to $self->{'default_language'}\n";
+    }
+    $language = $self->{'default_language'};
+    }
     if (!defined $encoding) {
+    # if textcat returned no encoding info it is assumed to be iso_8859_1
+    $encoding = "iso_8859_1";
+    }
+    if (!defined $supported_encodings{$encoding}) {
+    if ($self->{'verbosity'}) {
+        print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
+        print $outhandle "defaulting to $self->{'default_encoding'}\n";
+    }
+    $encoding = $self->{'default_encoding'};
+    }
+    if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
+    !defined $encodings::encodings->{$encoding}) {
     if ($self->{'verbosity'}) {
         print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";

trunk/gsdl/perllib/unicode.pm

-              r1868
+              r1870
 package unicode;
+%translations = ();
+use encodings;
 # ascii2unicode takes an (extended) ascii string (ISO-8859-1)
 …
+}
+# windows2unicode converts a string in one of the Windows code pages (e.g.
+# Windows 1256 (Arabic)) into a unicode array. These code pages are similar
+# to but not identical to the corresponding ISO-8859 encodings.
+#
+# $encoding should be the code page name (e.g. '1252'); the map file used
+# is unicode/MAPPINGS/WINDOWS/$encoding.TXT under $GSDLHOME.
+#
+# Characters below 0x80 are copied through unchanged, everything else is
+# looked up in the "$encoding-unicode" translation table. An empty array
+# is returned if the mapping can't be loaded.
+sub windows2unicode {
+    my ($encoding, $in) = @_;
+    my $out = [];
+
+    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
+                                      "WINDOWS", "$encoding.TXT");
+    if (!&loadmapping ($encoding, $mapfile)) {
+        return $out;
+    }
+
+    my $table = "$encoding-unicode";
+    foreach my $char (split (//, $in)) {
+        my $code = ord ($char);
+        if ($code >= 0x80) {
+            push (@$out, $translations{$table}->{$code});
+        } else {
+            push (@$out, $code);
+        }
+    }
+
+    return $out;
+}
+# iso2unicode converts an iso-8859 encoded string (e.g. iso-8859-6
+# (Arabic)) into a unicode array. Only characters >= 0xA0 are looked up in
+# the mapping (all characters below that are the same for every iso-8859
+# character set and therefore already the same as unicode).
+#
+# Note that while this function will work for iso-8859-1 (latin 1) it'll
+# be much faster to use ascii2unicode() or ascii2utf8()
+#
+# $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the
+# encoding is; the map file used is unicode/MAPPINGS/ISO_8859/$encoding.TXT
+# under $GSDLHOME.
+#
+# An empty array is returned if the mapping can't be loaded.
+sub iso2unicode {
+    my ($encoding, $in) = @_;
+    my $out = [];
+
+    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
+                                      "ISO_8859", "$encoding.TXT");
+    if (!&loadmapping ($encoding, $mapfile)) {
+        return $out;
+    }
+
+    my $table = "$encoding-unicode";
+    foreach my $char (split (//, $in)) {
+        my $code = ord ($char);
+        if ($code >= 0xA0) {
+            push (@$out, $translations{$table}->{$code});
+        } else {
+            push (@$out, $code);
+        }
+    }
+
+    return $out;
+}
+# cyrillic2unicode converts a KOI8 encoded string into a unicode array.
+#
+# values for $encoding may be 'koi8_r' or 'koi8_u'; the map file used is
+# unicode/MAPPINGS/CYRILLIC/$encoding.txt under $GSDLHOME.
+#
+# Characters below 0x80 are copied through unchanged, everything else is
+# looked up in the "$encoding-unicode" translation table. An empty array
+# is returned if the mapping can't be loaded.
+sub cyrillic2unicode {
+    my ($encoding, $in) = @_;
+    my $out = [];
+
+    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
+                                      "CYRILLIC", "$encoding.txt");
+    if (!&loadmapping ($encoding, $mapfile)) {
+        return $out;
+    }
+
+    my $table = "$encoding-unicode";
+    foreach my $char (split (//, $in)) {
+        my $code = ord ($char);
+        if ($code >= 0x80) {
+            push (@$out, $translations{$table}->{$code});
+        } else {
+            push (@$out, $code);
+        }
+    }
+
+    return $out;
+}
+# iscii2unicode converts an ISCII encoded string into a unicode array.
+#
+# values for $encoding may be 'Devanagari' only at present; the map file
+# used is unicode/MAPPINGS/ISCII/$encoding.txt under $GSDLHOME.
+#
+# Characters below 0xA0 are copied through unchanged, everything else is
+# looked up in the "$encoding-unicode" translation table. An empty array
+# is returned if the mapping can't be loaded.
+sub iscii2unicode {
+    my ($encoding, $in) = @_;
+    my $out = [];
+
+    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
+                                      "ISCII", "$encoding.txt");
+    if (!&loadmapping ($encoding, $mapfile)) {
+        return $out;
+    }
+
+    my $table = "$encoding-unicode";
+    foreach my $char (split (//, $in)) {
+        my $code = ord ($char);
+        if ($code >= 0xA0) {
+            push (@$out, $translations{$table}->{$code});
+        } else {
+            push (@$out, $code);
+        }
+    }
+
+    return $out;
+}
+# ascii2utf8 takes a (extended) ascii string and
+# returns a UTF-8 encoded string. This is just
+# a faster version of "&unicode2utf8(&ascii2unicode($str));"
+# ascii2utf8 takes a reference to an (extended) ascii string and returns a
+# UTF-8 encoded string. This is just a faster version of
+# "&unicode2utf8(&ascii2unicode($str));"
 sub ascii2utf8 {
     my ($in) = @_;
 …
     my ($c);
     my $i = 0;
     my $len = length($in);
     while ($i < $len) {
     $c = ord (substr ($in, $i, 1));
+    my $len = length($$in);
+    while ($i < $len) {
+    $c = ord (substr ($$in, $i, 1));
     if ($c < 0x80) {
         # ascii character
 …
     return $out;
+}
 # unicode2utf8 takes a unicode array as input and encodes it
 …
+    }
+    }
+    return $out;
+}
+    return $out;
+}
 # utf82unicode takes a utf-8 string and produces a unicode
 …
+}
 # unicode2ucs2 takes a unicode array and produces a UCS-2
 # unicode string (every two bytes forms a unicode character)
 …
     return $out;
+}
 # ucs22unicode takes a UCS-2 string and produces a unicode array
 …
+}
+# loadmapping expects the mapfile to contain (at least) two
+# tab-separated fields. The first field is the mapped value
+# and the second field is the unicode value.
+#
+# It returns 1 if successful, 0 if unsuccessful
+sub loadmapping {
+    my ($encoding, $mapfile) = @_;
+# takes a reference to a string and returns a reference to a unicode array
+sub convert2unicode {
+    my ($encoding, $textref) = @_;
+    if (!defined $encodings::encodings->{$encoding}) {
+    print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
+    return [];
+    }
+    my $encodename = "$encoding-unicode";
+    my $enc_info = $encodings::encodings->{$encoding};
+    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
+                      "to_uc", $enc_info->{'mapfile'});
+    if (!&loadmapencoding ($encodename, $mapfile)) {
+    print STDERR "unicode: ERROR - could not load encoding $encodename\n";
+    return [];
+    }
+    my $to = "$encoding-unicode";
+    my $from = "unicode-$encoding";
+    # check to see if the encoding has already been loaded
+    if (defined $translations{$to} && defined $translations{$from}) {
+    return 1;
+    }
+    if (!open (MAPFILE, $mapfile)) {
+    print STDERR "ERROR: unable to load mapfile $mapfile\n";
+    if (defined $enc_info->{'converter'}) {
+    my $converter = $enc_info->{'converter'};
+    return &$converter ($encodename, $textref);
+    }
+    if ($translations{$encodename}->{'count'} == 1) {
+    return &singlebyte2unicode ($encodename, $textref);
+    } else {
+    return &doublebyte2unicode ($encodename, $textref);
+    }
+}
+# singlebyte2unicode converts simple 8 bit encodings where characters below
+# 0x80 are normal ascii characters and the rest are decoded using the
+# appropriate mapping files.
+#
+# Examples of encodings that may be converted using singlebyte2unicode are
+# the iso-8859 and windows-125* series.
+sub singlebyte2unicode {
+    # $encodename is the name of the translation table handed to transchar,
+    # $textref is a reference to the text to convert.
+    # Returns a reference to an array with one unicode value per input byte.
+    my ($encodename, $textref) = @_;
+    my @codepoints = ();
+    foreach my $char (split (//, $$textref)) {
+        my $code = ord($char);
+        if ($code >= 0x80) {
+            # everything outside the ascii range goes through the
+            # translation table; a result of 0 means "no translation"
+            # and is shown as U+25A1 (a hollow square)
+            $code = &transchar ($encodename, $code);
+            $code = 0x25A1 if $code == 0;
+        }
+        push (@codepoints, $code);
+    }
+    return \@codepoints;
+}
+# doublebyte2unicode converts simple two byte encodings where characters
+# below code point 0x80 are single-byte characters and the rest are
+# double-byte characters.
+#
+# Examples of encodings that may be converted using doublebyte2unicode are
+# CJK encodings like GB encoded Chinese and UHC Korean.
+#
+# Note that no error checking is performed to make sure that the input text
+# is valid for the given encoding.
+#
+# Also, encodings that may contain characters of more than two bytes are
+# not supported (any EUC encoded text may in theory contain 3-byte
+# characters but in practice only one and two byte characters are used).
+sub doublebyte2unicode {
+    # $encodename is the name of the translation table handed to transchar,
+    # $textref is a reference to the text to convert.
+    # Returns a reference to an array of unicode values.
+    my ($encodename, $textref) = @_;
+    my $total = length($$textref);
+    my @codepoints = ();
+    my $pos = 0;
+    while ($pos < $total) {
+        my $lead = ord(substr($$textref, $pos, 1));
+        if ($lead < 0x80) {
+            # single-byte character, copied through unchanged
+            push (@codepoints, $lead);
+            $pos++;
+            next;
+        }
+        if ($pos + 1 >= $total) {
+            # the text ends in the middle of a double-byte character;
+            # report it and emit nothing for the stray lead byte
+            print STDERR "unicode: ERROR missing second half of double-byte character\n";
+            $pos++;
+            next;
+        }
+        # double-byte character: lead byte is the high half of the
+        # value that gets looked up, the following byte the low half
+        my $trail = ord(substr($$textref, $pos+1, 1));
+        my $code = &transchar ($encodename, ($lead << 8) | $trail);
+        # a result of 0 means "no translation": show U+25A1 instead
+        $code = 0x25A1 if $code == 0;
+        push (@codepoints, $code);
+        $pos += 2;
+    }
+    return \@codepoints;
+}
+# Shift-JIS to unicode
+# We can't use doublebyte2unicode for Shift-JIS because it uses some
+# single-byte characters above code point 0x80 (i.e. half-width katakana
+# characters in the range 0xA1-0xDF)
+sub shiftjis2unicode {
+    # $encodename is the name of the translation table handed to transchar,
+    # $textref is a reference to the Shift-JIS text to convert.
+    # Returns a reference to an array of unicode values. Bytes that are
+    # reported as errors on STDERR contribute nothing to the array.
+    my ($encodename, $textref) = @_;
+    my $total = length($$textref);
+    my @codepoints = ();
+    my $pos = 0;
+    while ($pos < $total) {
+        my $lead = ord(substr($$textref, $pos, 1));
+        my $halfwidth_katakana = ($lead >= 0xA1 && $lead <= 0xDF);
+        if ($halfwidth_katakana || $lead == 0x5c || $lead == 0x7E) {
+            # single-byte characters that still need translating:
+            # half-width katakana and the JIS Roman yen and overline
+            my $code = &transchar ($encodename, $lead);
+            # a result of 0 means "no translation": show U+25A1 instead
+            $code = 0x25A1 if $code == 0;
+            push (@codepoints, $code);
+            $pos++;
+            next;
+        }
+        if ($lead < 0x80) {
+            # ASCII, copied through unchanged
+            push (@codepoints, $lead);
+            $pos++;
+            next;
+        }
+        if ($lead >= 0xEF) {
+            # not accepted as the first byte of a double-byte character
+            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
+            $pos++;
+            next;
+        }
+        if ($pos + 1 >= $total) {
+            # the text ends in the middle of a double-byte character
+            print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
+            $pos++;
+            next;
+        }
+        my $trail = ord(substr($$textref, $pos+1, 1));
+        my $trail_ok = (($trail >= 0x40 && $trail <= 0x7E) ||
+                        ($trail >= 0x80 && $trail <= 0xFC));
+        if ($trail_ok) {
+            # double-byte Shift-JIS character
+            my $code = &transchar ($encodename, ($lead << 8) | $trail);
+            # a result of 0 means "no translation": show U+25A1 instead
+            $code = 0x25A1 if $code == 0;
+            push (@codepoints, $code);
+        } else {
+            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
+        }
+        # both bytes are consumed whether or not the pair was valid
+        $pos += 2;
+    }
+    return \@codepoints;
+}
+sub transchar {
+    my ($encoding, $from) = @_;
+    my $high = ($from / 256) % 256;
+    my $low = $from % 256;
+    return 0 unless defined $translations{$encoding};
+    my $block = $translations{$encoding}->{'map'};
+    if (ref ($block->[$high]) ne "ARRAY") {
     return 0;
+    }
+    my ($line, @line);
+    $translations{$to} = {};
+    $translations{$from} = {};
+    while (defined ($line = <MAPFILE>)) {
+    # remove comments
+    $line =~ s/\#.*$//;
+    next unless $line =~ /\S/;
+    # split the line into fields and do a few
+    # simple sanity checks
+    @line = split (/\t/, $line);
+    next unless (scalar(@line) >= 2 &&
+             $line[0] =~ /^0x/ &&
+             $line[1] =~ /^0x/);
+    my $a = hex($line[0]);
+    my $b = hex($line[1]);
+    $translations{$to}->{$a} = $b;
+    $translations{$from}->{$b} = $a;
+    }
+    close (MAPFILE);
+    return 1;
+}
+####################################################################################################
+    return $block->[$high]->[$low];
+}
 # %translations is of the form:
+#
 # encodings{encodingname-encodingname}->blocktranslation
+# encodings{encodingname-encodingname}->{'map'}->blocktranslation
 # blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
+#
 …
 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
-$encodings = {
-    'iso_8859_1' => {'fullname' => 'Latin1 (western languages)',
-             'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0},
-    'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)',
-             'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0},
-    'iso_8859_3' => {'fullname' => 'Latin3',
-             'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0},
-    'iso_8859_4' => {'fullname' => 'Latin4',
-             'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0},
-    'iso_8859_5' => {'fullname' => 'Cyrillic',
-             'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0},
-    'iso_8859_6' => {'fullname' => 'Arabic',
-             'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0},
-    'iso_8859_7' => {'fullname' => 'Greek',
-             'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0},
-    'iso_8859_8' => {'fullname' => 'Hebrew',
-             'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0},
-    'iso_8859_9' => {'fullname' => 'Latin5',
-             'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0},
-    'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)',
-               'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80},
-    'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)',
-               'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80},
-    'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)',
-               'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80},
-    'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',
-               'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80},
-    'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)',
-               'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80},
-    'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',
-               'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80},
-    'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',
-               'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80},
-    'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)',
-               'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80},
-    'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)',
-               'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80},
-    'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)',
-              'mapfile' => 'win874.ump', 'ascii_delim' => 0x80},
-    'koi8_r' => {'fullname' => 'Cyrillic',
-         'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80},
-    'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)',
-         'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80},
-    'iscii_de' => {'fullname' => 'ISCII Devanagari',
-           'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0}
-};
-# returns a pointer to unicode array
-sub simple2unicode {
-    my ($encoding, $intext) = @_;
-    if (!defined ($encodings->{$encoding})) {
-    print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n";
-    return [];
+    }
-    my $info = $encodings->{$encoding};
-    my $encodename = "$encoding-unicode";
-    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc",
-                      $info->{'mapfile'});
-    if (!&loadmapencoding ($encodename, $mapfile)) {
-    print STDERR "unicode: ERROR - could not load encoding $encodename\n";
-    return [];
+    }
-    my @outtext = ();
-    my $len = length($intext);
-    my ($c);
-    my $i = 0;
-    while ($i < $len) {
-    if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) {
-        # normal ascii character
-        push (@outtext, $c);
-    } else {
-        push (@outtext, &transchar ($encodename, $c));
+    }
-    $i ++;
+    }
-    return \@outtext;
+}
 # returns 1 if successful, 0 if unsuccessful
 …
     binmode (MAPFILE);
     $translations{$encoding} = [@array256];
+    $translations{$encoding} = {'map' => [@array256], 'count' => 0};
     my $block = $translations{$encoding};
 …
     while (read(MAPFILE, $in, 1) == 1) {
     $i = unpack ("C", $in);
     $block->[$i] = [@array256];
+    $block->{'map'}->[$i] = [@array256];
     for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
         my ($n1, $n2) = unpack ("CC", $in);
+        $block->[$i]->[$j] = ($n1*256) + $n2;
+    }
+        $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
+    }
+    $block->{'count'} ++;
+    }
 …
+}
-sub transchar {
-    my ($encoding, $from) = @_;
-    my $high = ($from / 256) % 256;
-    my $low = $from % 256;
-    return 0 unless defined $translations{$encoding};
-    my $block = $translations{$encoding};
-    if (ref ($block->[$high]) ne "ARRAY") {
-    return 0;
+    }
-    return $block->[$high]->[$low];
+}
 ;

trunk/gsdl/src/recpt/converter.cpp

-              r1285
+              r1870
 // the converters within converterinfo become the property of
 // of this class after add_converter has been called. The converters
 // remain the responsability of the calling code and will not be
 // deleted by this class.
+// the converters within converterinfo become the property of this class
+// after add_converter has been called. The converters remain the
+// responsibility of the calling code and will not be deleted by this
+// class.
 void convertinfoclass::add_converter (const text_t &name, inconvertclass *inconverter,
                       rzwsoutconvertclass *outconverter) {

trunk/gsdl/src/recpt/converter.h

r1285	r1870
98	98	size_type size() const {return converters.size();}
99	99
	100	const_iterator find(text_t &key) {return converters.find(key);} // forwards to converters.find; the result has to be returned
100	101
101	102	// added functionality

trunk/gsdl/src/recpt/librarymain.cpp

-              r1860
+              r1870
 #include "mgsearch.h"
 #include "mgppsearch.h"
-#include "fileutil.h"
 #include "collectset.h"
 #include <assert.h>
 …
 #include "htmlbrowserclass.h"
 #include "phindbrowserclass.h"
-#include "recptconfig.h"
 int main () {
 …
   // add the protocol to the receptionist
   recpt.add_protocol (&nproto);
-  // z39.50 stuff - johnmcp
   // z39.50 stuff - johnmcp
 …
 #endif
-  // Read main.cfg to get all the "Encoding" lines and add corresponding converters.
-  // It might be possible to move this to somewhere like receptionist::configure, depending
-  // on whether we need the converters before then (I don't think we do).
-  text_tarray cfgline;
-  text_t maincfg = filename_cat (gsdlhome, "etc", "main.cfg");
-  if (file_exists (maincfg)) {
-    char *maincfgc = maincfg.getcstr();
-#ifdef GSDL_USE_IOS_H
-    ifstream confin (maincfgc, ios::in | ios::nocreate);
-#else
-    ifstream confin (maincfgc, ios::in);
-#endif
-    delete maincfgc;
-    if (confin) {
-      text_t subkey, subvalue, shortname;
-      text_tset saved;
-      text_tmap tmp;
-      text_t::const_iterator cfglinesub_here;
-      text_tarray::const_iterator cfgline_here;
-      text_tarray::const_iterator cfgline_end;
-      while (read_cfg_line(confin, cfgline) >= 0) {
-    if (cfgline.size () >= 4 && cfgline[0] == "Encoding") {
-      tmp.erase(tmp.begin(), tmp.end());
-      cfgline_here = cfgline.begin();
-      cfgline_end = cfgline.end();
-      while (cfgline_here != cfgline_end) {
-        cfglinesub_here = getdelimitstr((*cfgline_here).begin(),
-                        (*cfgline_here).end(), '=', subkey);
-        if (subkey == "shortname") {
-          shortname = substr (cfglinesub_here, (*cfgline_here).end());
-        } else {
-          tmp[subkey] = substr (cfglinesub_here, (*cfgline_here).end());
+        }
-        cfgline_here++;
+      }
-      // we just use the saved set to prevent multiple encodings being added
-      // that use the same shortname (i.e. any encodings after the first with
-      // the same name will be ignored).
-      if (!shortname.empty() && saved.find(shortname) == saved.end()) {
-        saved.insert(shortname);
-        if (tmp["type"] == "UTF8") {
-          utf8inconvertclass *utf8inconvert = new utf8inconvertclass();
-          utf8outconvertclass *utf8outconvert = new utf8outconvertclass();
-          recpt.add_converter (shortname, utf8inconvert, utf8outconvert);
-        } else if (tmp["type"] == "GB") {
-          mapinconvertclass *gbinconvert = new mapinconvertclass();
-          gbinconvert->setmapfile (gsdlhome, "gbku", 0x25a1);
-          mapoutconvertclass *gboutconvert = new mapoutconvertclass();
-          gboutconvert->setmapfile (gsdlhome, "ugbk", 0xa1f5);
-          recpt.add_converter (shortname, gbinconvert, gboutconvert);
-        } else {
-          if (!tmp["mapfile"].empty()) {
-        if (tmp["type"] == "ISO_8859" && tmp["mapfile"] == "1.TXT") {
-          // iso-8859-1 is a special case as it'll always be
-          // supported by the standard converter class and
-          // therefore doesn't need to use its mapping file
-          inconvertclass *inconvert = new inconvertclass();
-          rzwsoutconvertclass *outconvert = new rzwsoutconvertclass();
-          recpt.add_converter (shortname, inconvert, outconvert);
-        } else {
-          text_t mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS", tmp["type"], tmp["mapfile"]);
-          if (file_exists (mapfile)) {
-            simplemapinconvertclass *inconvert = new simplemapinconvertclass();
-            inconvert->setmapfile (mapfile);
-            simplemapoutconvertclass *outconvert = new simplemapoutconvertclass();
-            outconvert->setmapfile (mapfile);
-            recpt.add_converter (shortname, inconvert, outconvert);
+          }
+        }
+          }
+        }
+      }
+    }
+      }
-      confin.close ();
+    }
+  }
-  // add other converters
-  //  utf8inconvertclass utf8inconvert;
-  //  utf8outconvertclass utf8outconvert;
-  //  recpt.add_converter ("u", &utf8inconvert, &utf8outconvert);
-  //  mapinconvertclass gbinconvert;
-  //  gbinconvert.setmapfile (gsdlhome, "gbku", 0x25a1);
-  //  mapoutconvertclass gboutconvert;
-  //  gboutconvert.setmapfile (gsdlhome, "ugbk", 0xa1f5);
-  //  recpt.add_converter ("g", &gbinconvert, &gboutconvert);
-  // arabic
-  //  text_t mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");
-  //  mapfile = filename_cat (mapfile, "WINDOWS", "1256.TXT");
-  //  simplemapinconvertclass arinconvert;
-  //  arinconvert.setmapfile (mapfile);
-  //  simplemapoutconvertclass aroutconvert;
-  //  aroutconvert.setmapfile (mapfile);
-  //  recpt.add_converter ("a", &arinconvert, &aroutconvert);
-  // cyrillic
-  //  mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");
-  //  mapfile = filename_cat (mapfile, "WINDOWS", "1251.TXT");
-  //  simplemapinconvertclass cyinconvert;
-  //  cyinconvert.setmapfile (mapfile);
-  //  simplemapoutconvertclass cyoutconvert;
-  //  cyoutconvert.setmapfile (mapfile);
-  //  recpt.add_converter ("c", &cyinconvert, &cyoutconvert);
-  // hindi
-//    armapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");
-//    armapfile = filename_cat (armapfile, "ISCII", "Devanagari.txt");
-//    simplemapinconvertclass arinconvert;
-//    arinconvert.setmapfile (armapfile);
-//    simplemapoutconvertclass aroutconvert;
-//    aroutconvert.setmapfile (armapfile);
-//    recpt.add_converter ("a", &arinconvert, &aroutconvert);
   // the list of actions. Note: these actions will become invalid
   // at the end of this function.

trunk/gsdl/src/recpt/pageaction.cpp

-              r1861
+              r1870
+      }
-      if (pref_langs.find("zh") == pref_langs.end())
-    disp.setmacro ("encodingoption", "preferences", "");
     } else {
       while (tlang != elang) {
 …
     // create the "encoding" selection box for the preferences page
     text_t &arg_w = args["w"];
+    // put encodings in another map to sort them by longname
+    text_tmap encodings;
+    encodinginfo_tmap::const_iterator thisenc = configinfo.encodings.begin();
+    encodinginfo_tmap::const_iterator endenc = configinfo.encodings.end();
+    text_t encodingoption;
+    text_tmap::const_iterator thisenc = configinfo.encodings.begin();
+    text_tmap::const_iterator endenc = configinfo.encodings.end();
     while (thisenc != endenc) {
+      encodings[(*thisenc).second.longname] = (*thisenc).first;
+      thisenc++;
+    }
+    text_tmap::iterator tenc = encodings.begin();
+    text_tmap::iterator eenc = encodings.end();
+    text_t encodingoption;
+    while (tenc != eenc) {
+      encodingoption += "<option value=\"" + (*tenc).second + "\"";
+      if ((*tenc).second == arg_w) encodingoption += " selected";
+      encodingoption += ">" + (*tenc).first + "\n";
+      tenc ++;
+      encodingoption += "<option value=\"" + (*thisenc).second + "\"";
+      if ((*thisenc).second == arg_w) encodingoption += " selected";
+      encodingoption += ">" + (*thisenc).first + "\n";
+      thisenc ++;
+    }

trunk/gsdl/src/recpt/receptionist.cpp

-              r1861
+              r1870
+}
-void encodinginfo_t::clear () {
-  longname.clear();
-  label.clear();
+}
 receptionist::receptionist () {
   // create a list of cgi arguments
 …
 void receptionist::configure (const text_t &key, const text_tarray &cfgline) {
   // configure the receptionist
   if (cfgline.size() >= 1) {
     cgiarginfo *info = NULL;
 …
     } else if (key == "Encoding") {
+      text_t subkey, subvalue;
+      text_t shortname, longname, label;
+      configure_encoding (cfgline);
+    } else if (key == "Language") {
+      text_t subkey, subvalue, shortname;
+      languageinfo_t lang;
       text_t::const_iterator cfglinesub_here;
       text_tarray::const_iterator cfgline_here = cfgline.begin();
 …
       shortname = substr (cfglinesub_here, (*cfgline_here).end());
     } else if (subkey == "longname") {
+      longname = substr (cfglinesub_here, (*cfgline_here).end());
+    } else if (subkey == "label") {
+      label = substr (cfglinesub_here, (*cfgline_here).end());
+    }
+    cfgline_here++;
+      }
+      if (!shortname.empty() && !label.empty()) {
+    encodinginfo_t enc;
+    if (longname.empty()) enc.longname = shortname;
+    else enc.longname = longname;
+    enc.label = label;
+    configinfo.encodings[shortname] = enc;
+      }
+    } else if (key == "Language") {
+      text_t subkey, subvalue;
+      text_t shortname, longname, defaultencoding;
+      text_t::const_iterator cfglinesub_here;
+      text_tarray::const_iterator cfgline_here = cfgline.begin();
+      text_tarray::const_iterator cfgline_end = cfgline.end();
+      while (cfgline_here != cfgline_end) {
+    cfglinesub_here = getdelimitstr((*cfgline_here).begin(),
+                    (*cfgline_here).end(), '=', subkey);
+    if (subkey == "shortname") {
+      shortname = substr (cfglinesub_here, (*cfgline_here).end());
+    } else if (subkey == "longname") {
+      longname = substr (cfglinesub_here, (*cfgline_here).end());
+      lang.longname = substr (cfglinesub_here, (*cfgline_here).end());
     } else if (subkey == "default_encoding") {
       defaultencoding = substr (cfglinesub_here, (*cfgline_here).end());
+      lang.defaultencoding = substr (cfglinesub_here, (*cfgline_here).end());
+    }
     cfgline_here++;
+      }
       if (!shortname.empty()) {
+    languageinfo_t lang;
+    if (longname.empty()) lang.longname = shortname;
+    else lang.longname = longname;
+    lang.defaultencoding = defaultencoding;
+    if (lang.longname.empty()) lang.longname = shortname;
     configinfo.languages[shortname] = lang;
+      }
 …
+// init should be called after all the actions, protocols, and
+// converters have been added to the receptionist and after everything
+// has been configured but before any pages are created.
+// It returns true on success and false on failure. If false is
+// returned getpage should not be called (without producing
+// meaningless output), instead an error page should be
+// produced by the calling code.
+// init should be called after all the actions and protocols have been
+// added to the receptionist and after everything has been configured but
+// before any pages are created.  It returns true on success and false on
+// failure. If false is returned getpage should not be called (without
+// producing meaningless output), instead an error page should be produced
+// by the calling code.
 bool receptionist::init (ostream &logout) {
   // first configure collectdir
   text_t thecollectdir = configinfo.gsdlhome;
 …
   srand (time(NULL));
-  // make the output converters remove all the zero-width spaces
-  convertinfoclass::iterator converthere = converters.begin ();
-  convertinfoclass::iterator convertend = converters.end ();
-  while (converthere != convertend) {
-    assert ((*converthere).second.outconverter != NULL);
-    if ((*converthere).second.outconverter != NULL) {
-      (*converthere).second.outconverter->set_rzws(1);
+    }
-    converthere++;
+  }
   // if maintainer email address is something dodgy (for now I'll define
   // dodgy as being anything that doesn't contain '@') disable EmailEvents
 …
   // make sure the encoding is valid
   if (configinfo.encodings.find(default_encoding) == configinfo.encodings.end()) return "";
+  if (converters.find(default_encoding) == converters.end()) return "";
   return default_encoding;
 …
   // add the encoding information
   if (response == content) {
     if (configinfo.encodings.find(args["w"]) != configinfo.encodings.end()) {
       response_data += "; charset=" + configinfo.encodings[args["w"]].label;
+    if (converters.find(args["w"]) != converters.end()) {
+      response_data += "; charset=" + args["w"];
     } else {
       // default to latin 1
 …
+}
+// Handles an "Encoding" line from a configuration file - note that the
+// configinfo.encodings map is a bit of a hack (to be fixed when the
+// configuration files are tidied up).
+void receptionist::configure_encoding (const text_tarray &cfgline) {
+  text_t subkey, subvalue, shortname, longname, mapfile;
+  text_t::const_iterator cfglinesub_here;
+  text_tarray::const_iterator cfgline_here = cfgline.begin();
+  text_tarray::const_iterator cfgline_end = cfgline.end();
+  while (cfgline_here != cfgline_end) {
+    cfglinesub_here = getdelimitstr((*cfgline_here).begin(),
+                    (*cfgline_here).end(), '=', subkey);
+    if (subkey == "shortname") {
+      shortname = substr (cfglinesub_here, (*cfgline_here).end());
+    } else if (subkey == "longname") {
+      longname = substr (cfglinesub_here, (*cfgline_here).end());
+    } else if (subkey == "map") {
+      mapfile = substr (cfglinesub_here, (*cfgline_here).end());
+    }
+    cfgline_here++;
+  }
+  if (!shortname.empty()) {
+    if (longname.empty()) longname = shortname;
+    // add the converter
+    if (shortname == "utf-8") {
+      utf8inconvertclass *utf8inconvert = new utf8inconvertclass();
+      utf8outconvertclass *utf8outconvert = new utf8outconvertclass();
+      utf8outconvert->set_rzws(1);
+      add_converter (shortname, utf8inconvert, utf8outconvert);
+      configinfo.encodings[longname] = shortname;
+    } else if (!mapfile.empty()) {
+      if (mapfile == "8859_1.ump") {
+    // iso-8859-1 is a special case as it'll always be supported by the
+    // standard converter class and therefore doesn't need to use its
+    // mapping file
+    inconvertclass *inconvert = new inconvertclass();
+    rzwsoutconvertclass *outconvert = new rzwsoutconvertclass();
+    outconvert->set_rzws(1);
+    add_converter (shortname, inconvert, outconvert);
+    configinfo.encodings[longname] = shortname;
+      } else {
+    text_t to_uc_map = filename_cat(configinfo.gsdlhome, "mappings", "to_uc", mapfile);
+    text_t from_uc_map = filename_cat(configinfo.gsdlhome, "mappings", "from_uc", mapfile);
+    if (file_exists(to_uc_map) && file_exists(from_uc_map)) {
+      mapinconvertclass *mapinconvert = new mapinconvertclass();
+      mapinconvert->setmapfile (to_uc_map, 0x003F);
+      mapoutconvertclass *mapoutconvert = new mapoutconvertclass();
+      mapoutconvert->setmapfile (from_uc_map, 0x3F);
+      mapoutconvert->set_rzws(1);
+      add_converter (shortname, mapinconvert, mapoutconvert);
+      configinfo.encodings[longname] = shortname;
+    }
+      }
+    }
+  }
+}

trunk/gsdl/src/recpt/receptionist.h

-              r1860
+              r1870
 };
-struct encodinginfo_t {
-  void clear();
-  encodinginfo_t () {clear();}
-  text_t longname;
-  text_t label;
-};
 typedef map<text_t, collectioninfo_t, lttext_t> colinfo_tmap;
 typedef map<text_t, languageinfo_t, lttext_t> languageinfo_tmap;
-typedef map<text_t, encodinginfo_t, lttext_t> encodinginfo_tmap;
 enum events_t {Disabled, CollectorEvents, AllEvents};
 …
   languageinfo_tmap languages;
+  encodinginfo_tmap encodings;
+  // encodings is just a simple mapping from encoding longnames to
+  // shortnames.  It's useful for now for creating the pulldown menu of
+  // encodings on the preferences page but isn't intended to be permanent.
+  text_tmap encodings;
   void clear ();
 …
   bool append_logstr (const text_t &filename, const text_t &logstr,
               ostream &logout);
+  void configure_encoding (const text_tarray &cfgline);
 public:

Context Navigation

Legend:

trunk/gsdl/bin/script/makemapfile.pl

trunk/gsdl/etc/main.cfg

trunk/gsdl/lib/gsdlunicode.cpp

trunk/gsdl/lib/gsdlunicode.h

trunk/gsdl/mappings/README

trunk/gsdl/perllib/doc.pm

trunk/gsdl/perllib/multiread.pm

trunk/gsdl/perllib/plugins/BasPlug.pm

trunk/gsdl/perllib/unicode.pm

trunk/gsdl/src/recpt/converter.cpp

trunk/gsdl/src/recpt/converter.h

trunk/gsdl/src/recpt/librarymain.cpp

trunk/gsdl/src/recpt/pageaction.cpp

trunk/gsdl/src/recpt/receptionist.cpp

trunk/gsdl/src/recpt/receptionist.h

Download in other formats: