Changeset 1870
- Timestamp:
- 2001-01-29T14:54:58+13:00 (23 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 10 added
- 3 deleted
- 17 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/makemapfile.pl
r1868 r1870 26 26 ########################################################################### 27 27 28 # Creates a binary map file for use by complex character encodings29 # (e.g. CJK encodings like GBK and Shift-JIS). The map file is written to30 # the $GSDLHOME/unicode directory.31 32 28 BEGIN { 33 29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'}; … … 37 33 38 34 use parsargv; 39 use cjk; 35 use util; 36 37 # %translations is of the form: 38 # 39 # encodings{encodingname-encodingname}->blocktranslation 40 # blocktranslation->[[0-255],[256-511], ..., [65280-65535]] 41 # 42 # Any of the top translation blocks can point to an undefined 43 # value. This data structure aims to allow fast translation and 44 # efficient storage. 45 %translations = (); 46 47 # @array256 is used for initialisation, there must be 48 # a better way... 49 @array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 50 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 51 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 52 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 53 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 54 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 55 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 56 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 57 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 58 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 59 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 60 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 61 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 62 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 63 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 64 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); 65 40 66 &main(); 41 67 … … 55 81 } 56 82 57 &cjk::makeencodingmapfile ($encoding, $mapfile); 83 if (!&loadencoding ($encoding, $mapfile)) { 84 die "couldn't load encoding $encoding"; 85 } 86 87 # write out map files 88 &writemapfile ("$encoding-unicode", $encoding, 1); 89 &writemapfile ("unicode-$encoding", $encoding, 0); 58 90 } 91 92 sub writemapfile { 93 my ($encoding, $filename, $tounicode) = @_; 94 95 $filename .= ".ump"; # unicode map file 96 if ($tounicode) { 97 $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "to_uc", $filename); 
98 } else { 99 $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "from_uc", $filename); 100 } 101 102 die "translation not defined" if (!defined $translations{$encoding}); 103 my $block = $translations{$encoding}; 104 105 print "writing $filename\n"; 106 open (MAPFILE, ">" . $filename) || die; 107 binmode (MAPFILE); 108 109 my ($i, $j); 110 for ($i=0; $i<256; $i++) { 111 if (ref ($block->[$i]) eq "ARRAY") { 112 print MAPFILE pack ("C", $i); 113 for ($j=0; $j<256; $j++) { 114 # unsigned short in network order 115 print MAPFILE pack ("CC", int($block->[$i]->[$j] / 256), 116 $block->[$i]->[$j] % 256); 117 } 118 } 119 } 120 close (MAPFILE); 121 } 122 123 # loadencoding expects the mapfile to contain (at least) two 124 # tab-separated fields. The first field is the mapped value 125 # and the second field is the unicode value. 126 # 127 # It returns 1 if successful, 0 if unsuccessful 128 sub loadencoding { 129 my ($encoding, $mapfile) = @_; 130 131 my $to = "$encoding-unicode"; 132 my $from = "unicode-$encoding"; 133 134 # check to see if the encoding has already been loaded 135 if (defined $translations{$to} && defined $translations{$from}) { 136 return 1; 137 } 138 139 return 0 unless open (MAPFILE, $mapfile); 140 141 my ($line, @line); 142 $translations{$to} = [@array256]; 143 $translations{$from} = [@array256]; 144 while (defined ($line = <MAPFILE>)) { 145 chomp $line; 146 # remove comments 147 $line =~ s/\#.*$//; 148 next unless $line =~ /\S/; 149 150 # split the line into fields and do a few 151 # simple sanity checks 152 @line = split (/\t/, $line); 153 next unless (scalar(@line) >= 2 && 154 $line[0] =~ /^0x/ && 155 $line[1] =~ /^0x/); 156 157 my $char = hex($line[0]); 158 my $unic = hex($line[1]); 159 160 # might need this for some versions of gb but not gbk 161 # $char = $char | 0x8080 unless ($encoding =~ /gbk/i); 162 163 &addchartrans ($translations{$to}, $char, $unic); 164 &addchartrans ($translations{$from}, $unic, $char); 165 } 166 167 close 
(MAPFILE); 168 169 return 1; 170 } 171 172 # addchartrans adds one character translation to a translation block. 173 # It also simplifies the translation block if possible. 174 sub addchartrans { 175 my ($block, $from, $to) = @_; 176 my $i = 0; 177 178 my $high = ($from / 256) % 256; 179 my $low = $from % 256; 180 181 if (ref ($block->[$high]) ne "ARRAY") { 182 $block->[$high] = [@array256]; 183 } 184 $block->[$high]->[$low] = $to; 185 } -
trunk/gsdl/etc/main.cfg
r1868 r1870 78 78 # Define the interface languages and encodings supported by this receptionist 79 79 80 # An "Encoding" line defines an encoding to be used by the receptionist 81 # options are: 82 # shortname -- Identifier for the given encoding. The shortname option is 83 # mandatory and must be unique for each "Encoding" line. 80 # An "Encoding" line defines an encoding to be used by the receptionist. 81 # Uncomment "Encoding" lines to include an encoding on your "preferences" page. 82 # Encoding line options are: 83 # shortname -- The standard charset label for the given encoding. The 84 # shortname option is mandatory. 84 85 # longname -- The display name of the given encoding. If longname isn't set 85 86 # it will default to using shortname instead. 86 # type -- The type of encoding. Note that for most encodings this 87 # value is the directory name under which the map file for 88 # this encoding resides in the Greenstone unicode/MAPPINGS 89 # directory (e.g. 'WINDOWS', 'ISO_8859' etc.). It may also 90 # take the values 'CJK' and 'UTF8'. 91 # mapfile -- The name of the map file for use when converting between 92 # utf8 and the given encoding. The mapfile option is mandatory 93 # for all encoding types with the exception of UTF8. If type 94 # is CJK, mapfile is the abbreviated name of the encoding as 95 # used by the binary mapping files (.ump files). i.e. if the 96 # encoding uses the map files gbku.ump and ugbk.ump, mapfile 97 # will be set to "gbk". 98 # label -- The standard label to which you must set the value of 99 # "charset" within http headers or html meta tags to get a web 100 # browser to use the given encoding. The label option is 101 # mandatory. 
102 Encoding shortname=utf8 "longname=Unicode (UTF-8)" type=UTF8 label=UTF-8 103 Encoding shortname=iso1 "longname=Western (ISO-8859-1)" type=ISO_8859 mapfile=1.TXT label=ISO-8859-1 104 Encoding shortname=w1251 "longname=Cyrillic (Windows-1251)" type=WINDOWS mapfile=1251.TXT label=windows-1251 105 Encoding shortname=w1256 "longname=Arabic (Windows-1256)" type=WINDOWS mapfile=1256.TXT label=windows-1256 106 Encoding shortname=w1256 "longname=Central European (Windows-1250)" type=WINDOWS mapfile=1250.TXT label=windows-1250 107 Encoding shortname=gb "longname=Chinese Simplified (GBK)" type=CJK label=GBK mapfile=gbk 108 Encoding shortname=sjis "longname=Japanese (Shift-JIS)" type=CJK label=shift_jis mapfile=sjis 109 Encoding shortname=koi8r "longname=Cyrillic (KOI8-R)" type=CYRILLIC mapfile=koi8_r.txt label=koi8-r 87 # map -- The name of the map file (i.e. the .ump file) for use when 88 # converting between unicode and the given encoding. The map 89 # option is mandatory for all encoding lines except the 90 # special case for utf8. 110 91 111 # The following encoding is not currently supported 112 # Encoding shortname=eucjp "longname=Japanese (EUC)" type=CJK label=euc-jp mapfile=jis 92 # The utf8 encoding is handled internally and doesn't require a map file. 93 # As a rule the utf8 encoding should always be enabled, especially if you 94 # have collections of documents that may not all be in the same 95 # language/encoding. 
96 Encoding shortname=utf-8 "longname=Unicode (UTF-8)" 97 98 # The ISO-8859 series 99 Encoding shortname=iso-8859-1 "longname=Western (ISO-8859-1)" map=8859_1.ump 100 #Encoding shortname=iso-8859-2 "longname=Central European (ISO-8859-2)" map=8859_2.ump 101 #Encoding shortname=iso-8859-3 "longname=Latin 3 (ISO-8859-3)" map=8859_3.ump 102 #Encoding shortname=iso-8859-4 "longname=Latin 4 (ISO-8859-4)" map=8859_4.ump 103 #Encoding shortname=iso-8859-5 "longname=Cyrillic (ISO-8859-5)" map=8859_5.ump 104 #Encoding shortname=iso-8859-6 "longname=Arabic (ISO-8859-6)" map=8859_6.ump 105 #Encoding shortname=iso-8859-7 "longname=Greek (ISO-8859-7)" map=8859_7.ump 106 #Encoding shortname=iso-8859-8 "longname=Hebrew (ISO-8859-8)" map=8859_8.ump 107 #Encoding shortname=iso-8859-9 "longname=Turkish (ISO-8859-9)" map=8859_9.ump 108 109 # Windows codepages 110 #Encoding shortname=windows-1250 "longname=Central European (Windows-1250)" map=win1250.ump 111 #Encoding shortname=windows-1251 "longname=Cyrillic (Windows-1251)" map=win1251.ump 112 #Encoding shortname=windows-1252 "longname=Western (Windows-1252)" map=win1252.ump 113 #Encoding shortname=windows-1253 "longname=Greek (Windows-1253)" map=win1253.ump 114 #Encoding shortname=windows-1254 "longname=Turkish (Windows-1254)" map=win1254.ump 115 #Encoding shortname=windows-1255 "longname=Hebrew (Windows-1255)" map=win1255.ump 116 Encoding shortname=windows-1256 "longname=Arabic (Windows-1256)" map=win1256.ump 117 #Encoding shortname=windows-1257 "longname=Baltic (Windows-1257)" map=win1257.ump 118 #Encoding shortname=windows-1258 "longname=Vietnamese (Windows-1258)" map=win1258.ump 119 #Encoding shortname=windows-874 "longname=Thai (Windows-874)" map=win874.ump 120 121 # KOI8 Cyrillic encodings 122 #Encoding shortname=koi8-r "longname=Cyrillic (KOI8-R)" map=koi8_r.ump 123 #Encoding shortname=koi8-u "longname=Cyrillic (KOI8-U)" map=koi8_u.ump 124 125 # CJK encodings (note that Shift-JIS Japanese isn't currently supported) 126 
Encoding shortname=gbk "longname=Chinese Simplified (GBK)" map=gbk.ump 127 Encoding shortname=big5 "longname=Chinese Traditional (Big5)" map=big5.ump 128 Encoding shortname=euc-jp "longname=Japanese (EUC)" map=euc_jp.ump 129 Encoding shortname=euc-kr "longname=Korean (UHC)" map=uhc.ump 113 130 114 131 … … 126 143 # interface language. This should be set to the 127 144 # "shortname" of a valid "Encoding" line 128 Language shortname=en longname=English default_encoding=iso 1129 Language shortname=fr longname=French default_encoding=iso 1130 Language shortname=zh longname=Chinese default_encoding=gb 131 Language shortname=de longname=German default_encoding=iso 1132 Language shortname=es longname=Spanish default_encoding=iso 1133 Language shortname=mi longname=Maori default_encoding=iso 1134 Language shortname=ar longname=Arabic default_encoding=w 1256135 Language shortname=pt longname=Portuguese default_encoding=iso 1136 Language shortname=nl longname=Dutch default_encoding=iso 1145 Language shortname=en longname=English default_encoding=iso-8859-1 146 Language shortname=fr longname=French default_encoding=iso-8859-1 147 Language shortname=zh longname=Chinese default_encoding=gbk 148 Language shortname=de longname=German default_encoding=iso-8859-1 149 Language shortname=es longname=Spanish default_encoding=iso-8859-1 150 Language shortname=mi longname=Maori default_encoding=iso-8859-1 151 Language shortname=ar longname=Arabic default_encoding=windows-1256 152 Language shortname=pt longname=Portuguese default_encoding=iso-8859-1 153 Language shortname=nl longname=Dutch default_encoding=iso-8859-1 137 154 138 155 -
trunk/gsdl/lib/gsdlunicode.cpp
r1310 r1870 263 263 // setmapfile will cause loadmapfile to be called when conversion is 264 264 // needed 265 bool mapconvert::setmapfile (const text_t &thegsdlhome, const text_t &theencoding, 266 unsigned short theabsentc) { 265 bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) { 267 266 // check to see if the mapfile has been already loaded 268 if (mapdata.loaded && gsdlhome == thegsdlhome && 269 encoding == theencoding && absentc == theabsentc) 270 return true; 267 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true; 271 268 272 269 unloadmapfile (); 273 gsdlhome = thegsdlhome; 274 encoding = theencoding; 270 mapfile = themapfile; 275 271 absentc = theabsentc; 276 272 … … 281 277 282 278 // loadmapfile should be called before any conversion is done 283 bool mapconvert::loadmapfile (const text_t &thegsdlhome, 284 const text_t &theencoding, 279 bool mapconvert::loadmapfile (const text_t &themapfile, 285 280 unsigned short theabsentc) { 286 281 FILE *mapfilein = (FILE *)NULL; 287 282 288 283 // check to see if the mapfile has been already loaded 289 if (mapdata.loaded && gsdlhome == thegsdlhome && 290 encoding == theencoding && absentc == theabsentc) 291 return true; 284 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true; 292 285 293 286 unloadmapfile (); 294 gsdlhome = thegsdlhome; 295 encoding = theencoding; 287 mapfile = themapfile; 296 288 absentc = theabsentc; 297 289 298 290 // open the map file 299 text_t filename = filename_cat (gsdlhome, "unicode"); 300 filename = filename_cat (filename, encoding); 301 filename += ".ump"; 302 char *cfilename = filename.getcstr(); 291 char *cfilename = mapfile.getcstr(); 303 292 if (cfilename == (char *)NULL) return false; 304 293 mapfilein = fopen(cfilename, "rb"); … … 359 348 unsigned short mapconvert::convert (unsigned short c) { 360 349 if (!mapdata.loaded) { 361 if (!gsdlhome.empty() && !encoding.empty() && 362 loadmapfile 
(gsdlhome, encoding, absentc)) { 350 if (!mapfile.empty() && loadmapfile (mapfile, absentc)) { 363 351 // do nothing, successfully loaded database 364 352 } else return absentc; -
trunk/gsdl/lib/gsdlunicode.h
r1310 r1870 114 114 // setmapfile will cause loadmapfile to be called when conversion is 115 115 // needed 116 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding, 117 unsigned short theabsentc); 116 bool setmapfile (const text_t &themapfile, unsigned short theabsentc); 118 117 119 118 // loadmapfile should be called before any conversion is done 120 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding, 121 unsigned short theabsentc); 119 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc); 122 120 void unloadmapfile (); 123 121 … … 129 127 130 128 protected: 131 text_t gsdlhome; 132 text_t encoding; 129 text_t mapfile; 133 130 unsigned short absentc; 134 131 mapdata_t mapdata; … … 146 143 147 144 // setmapfile will cause loadmapfile to be called when conversion is needed 148 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding, 149 unsigned short theabsentc) { 150 return converter.setmapfile (thegsdlhome, theencoding, theabsentc); 145 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) { 146 return converter.setmapfile (themapfile, theabsentc); 151 147 }; 152 148 153 149 // loadmapfile should be called before any conversion takes 154 150 // place 155 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding, 156 unsigned short theabsentc) { 157 return converter.loadmapfile (thegsdlhome, theencoding, theabsentc); 151 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) { 152 return converter.loadmapfile (themapfile, theabsentc); 158 153 }; 159 154 … … 189 184 190 185 // setmapfile will cause loadmapfile to be called when conversion is needed 191 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding, 192 unsigned short theabsentc) { 193 return converter.setmapfile (thegsdlhome, theencoding, theabsentc); 186 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) { 187 return converter.setmapfile (themapfile, 
theabsentc); 194 188 }; 195 189 196 190 // loadmapfile should be called before any conversion takes 197 191 // place 198 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding, 199 unsigned short theabsentc) { 200 return converter.loadmapfile (thegsdlhome, theencoding, theabsentc); 192 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) { 193 return converter.loadmapfile (themapfile, theabsentc); 201 194 }; 202 195 -
trunk/gsdl/mappings/README
r1868 r1870 1 This directory contains mapping files for converting various character 2 encodings to and from unicode. 1 This directory and its subdirectories contain .ump mapping files for 2 converting various character encodings to and from unicode. 3 4 To generate .ump files use a command like "makemapfile.pl -encoding 5 encodingname -mapfile textmapfile" where encodingname becomes the filename 6 of the two new .ump files and textmapfile is a plain text file containing a 7 tab-separated list of the form: 8 0x8167 0x201C 9 where the first column is the hexadecimal value of the encoded character 10 and the second is the hexadecimal value of its unicode equivalent. 11 12 13 14 The following .ump files were generated from their corresponding Microsoft 15 codepages. These codepages do, in some cases, differ very slightly from the 16 standards they were based on but we've used them anyway as they're so 17 extensively used on the web. 18 19 * gbk.ump: Simplified Chinese - generated from Microsoft's codepage 936 20 * shiftjis.ump: Japanese - generated from Microsoft's codepage 932 21 * uhc.ump: UHC Korean - generated from Microsoft's codepage 949 22 * big5.ump: Traditional Chinese - generated from Microsoft's codepage 950 -
trunk/gsdl/perllib/doc.pm
r1868 r1870 694 694 695 695 $self->set_utf8_metadata_element ($section, $field, 696 &unicode::ascii2utf8( $value));696 &unicode::ascii2utf8(\$value)); 697 697 } 698 698 … … 716 716 717 717 $self->add_utf8_metadata ($section, $field, 718 &unicode::ascii2utf8( $value));718 &unicode::ascii2utf8(\$value)); 719 719 } 720 720 … … 799 799 # convert the text to UTF-8 encoded unicode characters 800 800 # and add the text 801 $self->add_utf8_text($section, &unicode::ascii2utf8( $text));801 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text)); 802 802 } 803 803 -
trunk/gsdl/perllib/multiread.pm
r1868 r1870 28 28 # utf8 - either utf8 or unicode (automatically detected) 29 29 # unicode - just unicode (doesn't currently do endian detection) 30 # gb - GB 31 # iso_8859_[1-9] - 8 bit extended ascii encodings 32 # windows_125[0-8] - Windows codepages 1250 to 1258 33 # windows 874 - Windows codepage 874 34 # iscii_de - ISCII Devanagari 35 # shift_jis - Shift-JIS 36 # euc_jp - EUC encoded Japanese 37 # uhc - Unified Hangul Code (Korean) 30 # 31 # plus all encodings in the "encodings" package 38 32 39 33 package multiread; 40 34 41 35 use unicode; 42 use cjk;43 36 44 37 sub new { … … 203 196 } 204 197 205 if ($self->{'encoding'} eq "gb") { 206 # GB or GBK 198 if ($self->{'encoding'} eq "iso_8859_1") { 199 # we'll use ascii2utf8() for this as it's faster than going 200 # through convert2unicode() 207 201 my $line = ""; 208 202 if (defined ($line = <$handle>)) { 209 return &unicode::unicode2utf8 (&cjk::gb2unicode ($line)); 210 } 211 return undef; 212 } 213 214 if ($self->{'encoding'} eq "iso_8859_1") { 215 # special case for iso_8859_1 as &ascii2utf8($line) is faster than 216 # &unicode2utf8(iso2unicode('1', $line)) 217 my $line = ""; 218 if (defined ($line = <$handle>)) { 219 return &unicode::ascii2utf8 ($line); 220 } 221 return undef; 222 } 223 224 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) { 225 my $line = ""; 226 if (defined ($line = <$handle>)) { 227 return &unicode::unicode2utf8(&unicode::iso2unicode ($1, $line)); 228 } 229 return undef; 230 } 231 232 if ($self->{'encoding'} =~ /windows_(\d{3,4})$/) { 233 my $line = ""; 234 if (defined ($line = <$handle>)) { 235 return &unicode::unicode2utf8(&unicode::windows2unicode ($1, $line)); 236 } 237 return undef; 238 } 239 240 if ($self->{'encoding'} =~ /^koi8_[ru]$/) { 241 my $line = ""; 242 if (defined ($line = <$handle>)) { 243 return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line)); 244 } 245 return undef; 246 } 247 248 if ($self->{'encoding'} eq "iscii_de") { 249 my $line = ""; 
250 if (defined ($line = <$handle>)) { 251 return &unicode::unicode2utf8(&unicode::iscii2unicode ("Devanagari", $line)); 252 } 253 return undef; 254 } 255 256 # unknown encoding 203 return &unicode::ascii2utf8 (\$line); 204 } 205 } 206 207 # everything else uses unicode::convert2unicode 208 my $line = ""; 209 if (defined ($line = <$handle>)) { 210 return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line)); 211 } 212 257 213 return undef; 258 214 } … … 292 248 } 293 249 294 if ($self->{'encoding'} eq "gb") { 250 if ($self->{'encoding'} eq "iso_8859_1") { 251 # we'll use ascii2utf8() for this as it's faster than going 252 # through convert2unicode() 295 253 undef $/; 296 254 my $text = <$handle>; 297 255 $/ = "\n"; 298 $$outputref .= &unicode:: unicode2utf8 (&cjk::gb2unicode ($text));256 $$outputref .= &unicode::ascii2utf8 (\$text); 299 257 return; 300 258 } 301 302 if ($self->{'encoding'} eq "iso_8859_1") { 303 # special case for iso_8859_1 as &ascii2utf8($text) is faster than 304 # &unicode2utf8(iso2unicode('1', $text)) 305 undef $/; 306 my $text = <$handle>; 307 $/ = "\n"; 308 $$outputref .= &unicode::ascii2utf8 ($text); 309 return; 310 } 311 312 if ($self->{'encoding'} eq "shift_jis") { 313 undef $/; 314 my $text = <$handle>; 315 $/ = "\n"; 316 $$outputref .= &unicode::unicode2utf8(&cjk::sjis2unicode ($text)); 317 return; 318 } 319 320 if ($self->{'encoding'} eq "euc_jp") { 321 undef $/; 322 my $text = <$handle>; 323 $/ = "\n"; 324 $$outputref .= &unicode::unicode2utf8(&cjk::eucjp2unicode ($text)); 325 return; 326 } 327 328 if ($self->{'encoding'} eq "euc_kr") { 329 undef $/; 330 my $text = <$handle>; 331 $/ = "\n"; 332 $$outputref .= &unicode::unicode2utf8(&cjk::euckr2unicode ($text)); 333 return; 334 } 335 336 if ($self->{'encoding'} eq "uhc") { 337 undef $/; 338 my $text = <$handle>; 339 $/ = "\n"; 340 $$outputref .= &unicode::unicode2utf8(&cjk::uhc2unicode ($text)); 341 return; 342 } 343 344 # if we get to here we assume it's a 
simple 8 bit encoding 259 260 # everything else uses unicode::convert2unicode 345 261 undef $/; 346 262 my $text = <$handle>; 347 263 $/ = "\n"; 348 $$outputref .= &unicode::unicode2utf8 (&unicode::simple2unicode ($self->{'encoding'},$text));264 $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text)); 349 265 } 350 266 -
trunk/gsdl/perllib/plugins/BasPlug.pm
r1868 r1870 28 28 use parsargv; 29 29 use multiread; 30 use encodings; 30 31 use cnseg; 31 32 use acronym; … … 34 35 use diagnostics; 35 36 use DateExtract; 36 use iso639;37 38 # if textcat returns an encoding that isn't in this list39 # we'll print a warning and use the default encoding instead40 %supported_encodings = (41 "ascii" => "",42 "utf8" => "",43 "iso_8859_1" => "",44 "windows_1252" => "",45 "iso_8859_2" => "",46 "windows_1250" => "",47 "iso_8859_3" => "",48 "iso_8859_4" => "",49 "iso_8859_5" => "",50 "windows_1251" => "",51 "koi8_r" => "",52 "koi8_u" => "",53 "iso_8859_6" => "",54 "windows_1256" => "",55 "iso_8859_7" => "",56 "windows_1253" => "",57 "iso_8859_8" => "",58 "windows_1255" => "",59 "iso_8859_9" => "",60 "windows_1254" => "",61 "gb" => "",62 "iscii_de" => "",63 "windows_1257" => "",64 "windows_874" => "",65 "windows_1258" => "",66 "shift_jis" => "",67 "euc_jp" => "",68 "uhc" => ""69 );70 37 71 38 sub print_general_usage { … … 109 76 print STDERR " unicode: just unicode\n"; 110 77 111 print STDERR " iso_8859_1: Latin1 (western european languages)\n"; 112 print STDERR " windows_1252: Windows codepage 1252 (WinLatin1)\n"; 113 114 print STDERR " iso_8859_2: Latin2 (central and eastern european languages)\n"; 115 print STDERR " windows_1250: Windows codepage 1250 (WinLatin2)\n"; 116 117 print STDERR " iso_8859_3: Latin3\n"; 118 119 print STDERR " iso_8859_4: Latin4\n"; 120 121 print STDERR " iso_8859_5: Cyrillic\n"; 122 print STDERR " windows_1251: Windows codepage 1251 (WinCyrillic)\n"; 123 print STDERR " koi8_r: Cyrillic - Russian\n"; 124 print STDERR " koi8_u: Cyrillic - Ukrainian\n"; 125 126 print STDERR " iso_8859_6: Arabic\n"; 127 print STDERR " windows_1256: Windows codepage 1256 (WinArabic)\n"; 128 129 print STDERR " iso_8859_7: Greek\n"; 130 print STDERR " windows_1253: Windows codepage 1253 (WinGreek)\n"; 131 132 print STDERR " iso_8859_8: Hebrew\n"; 133 print STDERR " windows_1255: Windows codepage 1255 (WinHebrew)\n"; 134 135 print 
STDERR " iso_8859_9: Latin5\n"; 136 print STDERR " windows_1254: Windows codepage 1254 (WinTurkish)\n"; 137 138 print STDERR " gb: GB or GBK simplified Chinese\n"; 139 140 print STDERR " iscii_de: ISCII Devanagari\n"; 141 142 print STDERR " windows_1257: Windows codepage 1257 (WinBaltic)\n"; 143 144 print STDERR " windows_874: Windows codepage 874 (Thai)\n"; 145 146 print STDERR " windows_1258: Windows codepage 1258 (Vietnamese)\n"; 147 148 print STDERR " shift_jis: Shift-JIS (Japanese)\n"; 149 print STDERR " euc_jp: EUC encoded Japanese\n"; 150 151 print STDERR " uhc: Unified Hangul Code (Korean). This is a superset of\n"; 152 print STDERR " EUC encoded Korean\n\n"; 153 78 my $e = $encodings::encodings; 79 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) { 80 print STDERR " $enc: $e->{$enc}->{'name'}\n"; 81 } 154 82 155 83 print STDERR " -default_encoding If -input_encoding is set to 'auto' and the text categorization\n"; … … 196 124 197 125 my $enc = "^("; 198 map {$enc .= "|$_";} keys % supported_encodings;199 my $denc = $enc . " |unicode)\$";200 $enc .= " |unicode|auto)\$";126 map {$enc .= "|$_";} keys %$encodings::encodings; 127 my $denc = $enc . 
"ascii|utf8|unicode)\$"; 128 $enc .= "ascii|utf8|unicode|auto)\$"; 201 129 202 130 $self->{'outhandle'} = STDERR; … … 444 372 my @results = textcat::classify($text); 445 373 446 # foreach $i (@results) {447 # print STDERR "i: $i\n";448 # }449 450 374 if (scalar @results != 1) { 451 375 … … 468 392 # format language/encoding 469 393 my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/; 470 die "Invalid language\n" if !defined $language; 471 394 if (!defined $language) { 395 if ($self->{'verbosity'}) { 396 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - "; 397 print $outhandle "defaulting to $self->{'default_language'}\n"; 398 } 399 $language = $self->{'default_language'}; 400 } 472 401 if (!defined $encoding) { 473 # if textcat returned no encoding info it is assumed to be iso_8859_1 474 $encoding = "iso_8859_1"; 475 } 476 477 if (!defined $supported_encodings{$encoding}) { 402 if ($self->{'verbosity'}) { 403 print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - "; 404 print $outhandle "defaulting to $self->{'default_encoding'}\n"; 405 } 406 $encoding = $self->{'default_encoding'}; 407 } 408 409 if ($encoding !~ /^(ascii|utf8|unicode)$/ && 410 !defined $encodings::encodings->{$encoding}) { 478 411 if ($self->{'verbosity'}) { 479 412 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - "; -
trunk/gsdl/perllib/unicode.pm
r1868 r1870 30 30 31 31 package unicode; 32 33 %translations = (); 32 use encodings; 34 33 35 34 # ascii2unicode takes an (extended) ascii string (ISO-8859-1) … … 49 48 } 50 49 51 # windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic)) 52 # and returns a unicode array. These encodings are similar to but not 53 # identical to the corresponding ISO-8859 encodings. 54 # 55 # $encoding should be the code page name (e.g. '1252') 56 # 57 # The map files for these encodings should be in unicode/MAPPINGS/WINDOWS 58 sub windows2unicode { 59 my ($encoding, $in) = @_; 60 my $out = []; 61 62 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 63 "WINDOWS", "$encoding.TXT"); 64 return $out unless &loadmapping ($encoding, $mapfile); 65 66 my $i = 0; 67 my $len = length($in); 68 while ($i < $len) { 69 my $c = ord(substr ($in, $i, 1)); 70 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80); 71 push (@$out, $c); 72 $i++; 73 } 74 75 return $out; 76 } 77 78 # iso2unicode takes an iso-8859 encoded string (e.g. iso-8859-6 (Arabic)) 79 # and returns a unicode array. This function is much like windows2unicode() 80 # except that only characters >= 0xA0 are read from the mapping file (since 81 # all characters below that are the same for all iso-8859 character sets 82 # and therefore already the same as unicode). 
83 # 84 # Note that while this function will work for iso-8859-1 (latin 1) it'll be 85 # much faster to use ascii2unicode() or ascii2utf8() 86 # 87 # $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the 88 # encoding is 89 # 90 # The map files for these encodings should be in unicode/MAPPINGS/ISO_8859 91 sub iso2unicode { 92 my ($encoding, $in) = @_; 93 my $out = []; 94 95 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 96 "ISO_8859", "$encoding.TXT"); 97 return $out unless &loadmapping ($encoding, $mapfile); 98 99 my $i = 0; 100 my $len = length($in); 101 while ($i < $len) { 102 my $c = ord(substr ($in, $i, 1)); 103 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0); 104 push (@$out, $c); 105 $i++; 106 } 107 108 return $out; 109 } 110 111 # cyrillic2unicode is basically identical to windows2unicode, the only 112 # difference being that the map files live in unicode/MAPPINGS/CYRILLIC 113 # 114 # values for $encoding may be 'koi8_r' or 'koi8_u' 115 sub cyrillic2unicode { 116 my ($encoding, $in) = @_; 117 my $out = []; 118 119 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 120 "CYRILLIC", "$encoding.txt"); 121 return $out unless &loadmapping ($encoding, $mapfile); 122 123 my $i = 0; 124 my $len = length($in); 125 while ($i < $len) { 126 my $c = ord(substr ($in, $i, 1)); 127 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80); 128 push (@$out, $c); 129 $i++; 130 } 131 132 return $out; 133 } 134 135 # iscii2unicode is basically identical to iso2unicode, the only 136 # difference being that the map files live in unicode/MAPPINGS/ISCII 137 # 138 # values for $encoding may be 'Devanagari' only at present 139 sub iscii2unicode { 140 my ($encoding, $in) = @_; 141 my $out = []; 142 143 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 144 "ISCII", "$encoding.txt"); 145 return $out unless &loadmapping ($encoding, $mapfile); 146 147 my $i = 0; 148 my $len = 
length($in); 149 while ($i < $len) { 150 my $c = ord(substr ($in, $i, 1)); 151 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0); 152 push (@$out, $c); 153 $i++; 154 } 155 156 return $out; 157 } 158 159 # ascii2utf8 takes a (extended) ascii string and 160 # returns a UTF-8 encoded string. This is just 161 # a faster version of "&unicode2utf8(&ascii2unicode($str));" 50 # ascii2utf8 takes a reference to an (extended) ascii string and returns a 51 # UTF-8 encoded string. This is just a faster version of 52 # "&unicode2utf8(&ascii2unicode($str));" 162 53 sub ascii2utf8 { 163 54 my ($in) = @_; … … 166 57 my ($c); 167 58 my $i = 0; 168 my $len = length($ in);169 while ($i < $len) { 170 $c = ord (substr ($ in, $i, 1));59 my $len = length($$in); 60 while ($i < $len) { 61 $c = ord (substr ($$in, $i, 1)); 171 62 if ($c < 0x80) { 172 63 # ascii character … … 183 74 return $out; 184 75 } 185 186 76 187 77 # unicode2utf8 takes a unicode array as input and encodes it … … 210 100 } 211 101 } 212 213 return $out; 214 } 215 102 return $out; 103 } 216 104 217 105 # utf82unicode takes a utf-8 string and produces a unicode … … 268 156 } 269 157 270 271 158 # unicode2ucs2 takes a unicode array and produces a UCS-2 272 159 # unicode string (every two bytes forms a unicode character) … … 282 169 return $out; 283 170 } 284 285 171 286 172 # ucs22unicode takes a UCS-2 string and produces a unicode array … … 301 187 } 302 188 303 # loadmapping expects the mapfile to contain (at least) two 304 # tab-separated fields. The first field is the mapped value 305 # and the second field is the unicode value. 
306 # 307 # It returns 1 if successful, 0 if unsuccessful 308 sub loadmapping { 309 my ($encoding, $mapfile) = @_; 189 # takes a reference to a string and returns a reference to a unicode array 190 sub convert2unicode { 191 my ($encoding, $textref) = @_; 192 193 if (!defined $encodings::encodings->{$encoding}) { 194 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n"; 195 return []; 196 } 197 198 my $encodename = "$encoding-unicode"; 199 my $enc_info = $encodings::encodings->{$encoding}; 200 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", 201 "to_uc", $enc_info->{'mapfile'}); 202 if (!&loadmapencoding ($encodename, $mapfile)) { 203 print STDERR "unicode: ERROR - could not load encoding $encodename\n"; 204 return []; 205 } 310 206 311 my $to = "$encoding-unicode"; 312 my $from = "unicode-$encoding"; 313 314 # check to see if the encoding has already been loaded 315 if (defined $translations{$to} && defined $translations{$from}) { 316 return 1; 317 } 318 319 if (!open (MAPFILE, $mapfile)) { 320 print STDERR "ERROR: unable to load mapfile $mapfile\n"; 207 if (defined $enc_info->{'converter'}) { 208 my $converter = $enc_info->{'converter'}; 209 return &$converter ($encodename, $textref); 210 } 211 212 if ($translations{$encodename}->{'count'} == 1) { 213 return &singlebyte2unicode ($encodename, $textref); 214 } else { 215 return &doublebyte2unicode ($encodename, $textref); 216 } 217 } 218 219 # singlebyte2unicode converts simple 8 bit encodings where characters below 220 # 0x80 are normal ascii characters and the rest are decoded using the 221 # appropriate mapping files. 222 # 223 # Examples of encodings that may be converted using singlebyte2unicode are 224 # the iso-8859 and windows-125* series). 
225 sub singlebyte2unicode { 226 my ($encodename, $textref) = @_; 227 228 my @outtext = (); 229 my $len = length($$textref); 230 my ($c); 231 my $i = 0; 232 233 while ($i < $len) { 234 if (($c = ord(substr($$textref, $i, 1))) < 0x80) { 235 # normal ascii character 236 push (@outtext, $c); 237 } else { 238 $c = &transchar ($encodename, $c); 239 # put a black square if cannot translate 240 $c = 0x25A1 if $c == 0; 241 push (@outtext, $c); 242 } 243 $i ++; 244 } 245 return \@outtext; 246 } 247 248 # doublebyte2unicode converts simple two byte encodings where characters 249 # below code point 0x80 are single-byte characters and the rest are 250 # double-byte characters. 251 # 252 # Examples of encodings that may be converted using doublebyte2unicode are 253 # CJK encodings like GB encoded Chinese and UHC Korean. 254 # 255 # Note that no error checking is performed to make sure that the input text 256 # is valid for the given encoding. 257 # 258 # Also, encodings that may contain characters of more than two bytes are 259 # not supported (any EUC encoded text may in theory contain 3-byte 260 # characters but in practice only one and two byte characters are used). 
261 sub doublebyte2unicode { 262 my ($encodename, $textref) = @_; 263 264 my @outtext = (); 265 my $len = length($$textref); 266 my ($c1, $c2); 267 my $i = 0; 268 269 while ($i < $len) { 270 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) { 271 if ($i+1 < $len) { 272 # double-byte character 273 $c2 = ord(substr($$textref, $i+1, 1)); 274 my $c = &transchar ($encodename, ($c1 << 8) | $c2); 275 # put a black square if cannot translate 276 $c = 0x25A1 if $c == 0; 277 push (@outtext, $c); 278 $i += 2; 279 280 } else { 281 # error 282 print STDERR "unicode: ERROR missing second half of double-byte character\n"; 283 $i++; 284 } 285 286 } else { 287 # single-byte character 288 push (@outtext, $c1); 289 $i++; 290 } 291 } 292 return \@outtext; 293 } 294 295 # Shift-JIS to unicode 296 # We can't use doublebyte2unicode for Shift-JIS because it uses some 297 # single-byte characters above code point 0x80 (i.e. half-width katakana 298 # characters in the range 0xA1-0xDF) 299 sub shiftjis2unicode { 300 my ($encodename, $textref) = @_; 301 302 my @outtext = (); 303 my $len = length($$textref); 304 my ($c1, $c2); 305 my $i = 0; 306 307 while ($i < $len) { 308 $c1 = ord(substr($$textref, $i, 1)); 309 310 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) { 311 # Single-byte half-width katakana character or 312 # JIS Roman yen or overline characters 313 my $c = &transchar ($encodename, $c1); 314 # - put a black square if cannot translate 315 $c = 0x25A1 if $c == 0; 316 push (@outtext, $c); 317 $i++; 318 319 } elsif ($c1 < 0x80) { 320 # ASCII 321 push (@outtext, $c1); 322 $i ++; 323 324 } elsif ($c1 < 0xEF) { 325 if ($i+1 < $len) { 326 $c2 = ord(substr($$textref, $i+1, 1)); 327 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) { 328 # Double-byte shift-jis character 329 my $c = &transchar ($encodename, ($c1 << 8) | $c2); 330 # put a black square if cannot translate 331 $c = 0x25A1 if $c == 0; 332 push (@outtext, $c); 333 } else { 334 # error 335 print 
STDERR "unicode: ERROR Invalid Shift-JIS character\n"; 336 } 337 $i += 2; 338 } else { 339 # error 340 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n"; 341 $i ++; 342 } 343 } else { 344 # error 345 print STDERR "unicode: ERROR Invalid Shift-JIS character\n"; 346 $i ++; 347 } 348 } 349 return \@outtext; 350 } 351 352 sub transchar { 353 my ($encoding, $from) = @_; 354 my $high = ($from / 256) % 256; 355 my $low = $from % 256; 356 357 return 0 unless defined $translations{$encoding}; 358 359 my $block = $translations{$encoding}->{'map'}; 360 361 if (ref ($block->[$high]) ne "ARRAY") { 321 362 return 0; 322 363 } 323 324 my ($line, @line); 325 $translations{$to} = {}; 326 $translations{$from} = {}; 327 while (defined ($line = <MAPFILE>)) { 328 # remove comments 329 $line =~ s/\#.*$//; 330 next unless $line =~ /\S/; 331 332 # split the line into fields and do a few 333 # simple sanity checks 334 @line = split (/\t/, $line); 335 next unless (scalar(@line) >= 2 && 336 $line[0] =~ /^0x/ && 337 $line[1] =~ /^0x/); 338 339 my $a = hex($line[0]); 340 my $b = hex($line[1]); 341 342 $translations{$to}->{$a} = $b; 343 $translations{$from}->{$b} = $a; 344 } 345 346 close (MAPFILE); 347 348 return 1; 349 } 350 351 352 353 354 355 356 357 358 359 #################################################################################################### 360 364 return $block->[$high]->[$low]; 365 } 361 366 362 367 # %translations is of the form: 363 368 # 364 # encodings{encodingname-encodingname}-> blocktranslation369 # encodings{encodingname-encodingname}->{'map'}->blocktranslation 365 370 # blocktranslation->[[0-255],[256-511], ..., [65280-65535]] 366 371 # … … 388 393 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 389 394 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); 390 391 $encodings = {392 'iso_8859_1' => {'fullname' => 'Latin1 (western languages)',393 'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0},394 395 'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european 
languages)',396 'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0},397 398 'iso_8859_3' => {'fullname' => 'Latin3',399 'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0},400 401 'iso_8859_4' => {'fullname' => 'Latin4',402 'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0},403 404 'iso_8859_5' => {'fullname' => 'Cyrillic',405 'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0},406 407 'iso_8859_6' => {'fullname' => 'Arabic',408 'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0},409 410 'iso_8859_7' => {'fullname' => 'Greek',411 'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0},412 413 'iso_8859_8' => {'fullname' => 'Hebrew',414 'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0},415 416 'iso_8859_9' => {'fullname' => 'Latin5',417 'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0},418 419 'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)',420 'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80},421 422 'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)',423 'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80},424 425 'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)',426 'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80},427 428 'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',429 'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80},430 431 'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)',432 'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80},433 434 'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',435 'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80},436 437 'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',438 'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80},439 440 'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)',441 'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80},442 443 'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)',444 'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80},445 446 'windows_874' => 
{'fullname' => 'Windows codepage 874 (Thai)',447 'mapfile' => 'win874.ump', 'ascii_delim' => 0x80},448 449 'koi8_r' => {'fullname' => 'Cyrillic',450 'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80},451 452 'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)',453 'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80},454 455 'iscii_de' => {'fullname' => 'ISCII Devanagari',456 'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0}457 };458 459 # returns a pointer to unicode array460 sub simple2unicode {461 my ($encoding, $intext) = @_;462 463 if (!defined ($encodings->{$encoding})) {464 print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n";465 return [];466 }467 468 my $info = $encodings->{$encoding};469 my $encodename = "$encoding-unicode";470 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc",471 $info->{'mapfile'});472 473 if (!&loadmapencoding ($encodename, $mapfile)) {474 print STDERR "unicode: ERROR - could not load encoding $encodename\n";475 return [];476 }477 478 my @outtext = ();479 my $len = length($intext);480 my ($c);481 my $i = 0;482 483 while ($i < $len) {484 if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) {485 # normal ascii character486 push (@outtext, $c);487 } else {488 push (@outtext, &transchar ($encodename, $c));489 }490 $i ++;491 }492 return \@outtext;493 }494 395 495 396 # returns 1 if successful, 0 if unsuccessful … … 503 404 binmode (MAPFILE); 504 405 505 $translations{$encoding} = [@array256];406 $translations{$encoding} = {'map' => [@array256], 'count' => 0}; 506 407 my $block = $translations{$encoding}; 507 408 … … 509 410 while (read(MAPFILE, $in, 1) == 1) { 510 411 $i = unpack ("C", $in); 511 $block-> [$i] = [@array256];412 $block->{'map'}->[$i] = [@array256]; 512 413 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) { 513 414 my ($n1, $n2) = unpack ("CC", $in); 514 $block->[$i]->[$j] = ($n1*256) + $n2; 515 } 415 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2; 416 } 417 
$block->{'count'} ++; 516 418 } 517 419 … … 519 421 } 520 422 521 sub transchar {522 my ($encoding, $from) = @_;523 my $high = ($from / 256) % 256;524 my $low = $from % 256;525 526 return 0 unless defined $translations{$encoding};527 528 my $block = $translations{$encoding};529 530 if (ref ($block->[$high]) ne "ARRAY") {531 return 0;532 }533 return $block->[$high]->[$low];534 }535 536 537 538 539 423 1; 540 -
trunk/gsdl/src/recpt/converter.cpp
r1285 r1870 28 28 29 29 30 // the converters within converterinfo become the property of 31 // of this class after add_converter has been called. The converters32 // re main the responsability of the calling code and will not be33 // deleted by thisclass.30 // the converters within converterinfo become the property of this class 31 // after add_converter has been called. The converters remain the 32 // responsability of the calling code and will not be deleted by this 33 // class. 34 34 void convertinfoclass::add_converter (const text_t &name, inconvertclass *inconverter, 35 35 rzwsoutconvertclass *outconverter) { -
trunk/gsdl/src/recpt/converter.h
r1285 r1870 98 98 size_type size() const {return converters.size();} 99 99 100 const_iterator find(text_t &key) {converters.find(key);} 100 101 101 102 // added functionality -
trunk/gsdl/src/recpt/librarymain.cpp
r1860 r1870 39 39 #include "mgsearch.h" 40 40 #include "mgppsearch.h" 41 #include "fileutil.h"42 41 #include "collectset.h" 43 42 #include <assert.h> … … 64 63 #include "htmlbrowserclass.h" 65 64 #include "phindbrowserclass.h" 66 67 #include "recptconfig.h"68 65 69 66 int main () { … … 176 173 // add the protocol to the receptionist 177 174 recpt.add_protocol (&nproto); 178 179 // z39.50 stuff - johnmcp180 175 181 176 // z39.50 stuff - johnmcp … … 200 195 #endif 201 196 202 // Read main.cfg to get all the "Encoding" lines and add corresponding converters.203 // It might be possible to move this to somewhere like receptionist::configure, depending204 // on whether we need the converters before then (I don't think we do).205 text_tarray cfgline;206 text_t maincfg = filename_cat (gsdlhome, "etc", "main.cfg");207 if (file_exists (maincfg)) {208 char *maincfgc = maincfg.getcstr();209 #ifdef GSDL_USE_IOS_H210 ifstream confin (maincfgc, ios::in | ios::nocreate);211 #else212 ifstream confin (maincfgc, ios::in);213 #endif214 delete maincfgc;215 216 if (confin) {217 text_t subkey, subvalue, shortname;218 text_tset saved;219 text_tmap tmp;220 text_t::const_iterator cfglinesub_here;221 text_tarray::const_iterator cfgline_here;222 text_tarray::const_iterator cfgline_end;223 while (read_cfg_line(confin, cfgline) >= 0) {224 if (cfgline.size () >= 4 && cfgline[0] == "Encoding") {225 tmp.erase(tmp.begin(), tmp.end());226 cfgline_here = cfgline.begin();227 cfgline_end = cfgline.end();228 while (cfgline_here != cfgline_end) {229 cfglinesub_here = getdelimitstr((*cfgline_here).begin(),230 (*cfgline_here).end(), '=', subkey);231 if (subkey == "shortname") {232 shortname = substr (cfglinesub_here, (*cfgline_here).end());233 } else {234 tmp[subkey] = substr (cfglinesub_here, (*cfgline_here).end());235 }236 cfgline_here++;237 }238 // we just use the saved set to prevent multiple encodings being added239 // that use the same shortname (i.e. 
any encodings after the first with240 // the same name will be ignored).241 if (!shortname.empty() && saved.find(shortname) == saved.end()) {242 saved.insert(shortname);243 244 if (tmp["type"] == "UTF8") {245 utf8inconvertclass *utf8inconvert = new utf8inconvertclass();246 utf8outconvertclass *utf8outconvert = new utf8outconvertclass();247 recpt.add_converter (shortname, utf8inconvert, utf8outconvert);248 249 } else if (tmp["type"] == "GB") {250 mapinconvertclass *gbinconvert = new mapinconvertclass();251 gbinconvert->setmapfile (gsdlhome, "gbku", 0x25a1);252 mapoutconvertclass *gboutconvert = new mapoutconvertclass();253 gboutconvert->setmapfile (gsdlhome, "ugbk", 0xa1f5);254 recpt.add_converter (shortname, gbinconvert, gboutconvert);255 256 } else {257 if (!tmp["mapfile"].empty()) {258 259 if (tmp["type"] == "ISO_8859" && tmp["mapfile"] == "1.TXT") {260 // iso-8859-1 is a special case as it'll always be261 // supported by the standard converter class and262 // therefore doesn't need to use its mapping file263 inconvertclass *inconvert = new inconvertclass();264 rzwsoutconvertclass *outconvert = new rzwsoutconvertclass();265 recpt.add_converter (shortname, inconvert, outconvert);266 267 } else {268 text_t mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS", tmp["type"], tmp["mapfile"]);269 if (file_exists (mapfile)) {270 simplemapinconvertclass *inconvert = new simplemapinconvertclass();271 inconvert->setmapfile (mapfile);272 simplemapoutconvertclass *outconvert = new simplemapoutconvertclass();273 outconvert->setmapfile (mapfile);274 recpt.add_converter (shortname, inconvert, outconvert);275 }276 }277 }278 }279 }280 }281 }282 confin.close ();283 }284 }285 286 287 // add other converters288 // utf8inconvertclass utf8inconvert;289 // utf8outconvertclass utf8outconvert;290 // recpt.add_converter ("u", &utf8inconvert, &utf8outconvert);291 292 // mapinconvertclass gbinconvert;293 // gbinconvert.setmapfile (gsdlhome, "gbku", 0x25a1);294 // mapoutconvertclass 
gboutconvert;295 // gboutconvert.setmapfile (gsdlhome, "ugbk", 0xa1f5);296 // recpt.add_converter ("g", &gbinconvert, &gboutconvert);297 298 // arabic299 // text_t mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");300 // mapfile = filename_cat (mapfile, "WINDOWS", "1256.TXT");301 // simplemapinconvertclass arinconvert;302 // arinconvert.setmapfile (mapfile);303 // simplemapoutconvertclass aroutconvert;304 // aroutconvert.setmapfile (mapfile);305 // recpt.add_converter ("a", &arinconvert, &aroutconvert);306 307 // cyrillic308 // mapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");309 // mapfile = filename_cat (mapfile, "WINDOWS", "1251.TXT");310 // simplemapinconvertclass cyinconvert;311 // cyinconvert.setmapfile (mapfile);312 // simplemapoutconvertclass cyoutconvert;313 // cyoutconvert.setmapfile (mapfile);314 // recpt.add_converter ("c", &cyinconvert, &cyoutconvert);315 316 // hindi317 // armapfile = filename_cat (gsdlhome, "unicode", "MAPPINGS");318 // armapfile = filename_cat (armapfile, "ISCII", "Devanagari.txt");319 // simplemapinconvertclass arinconvert;320 // arinconvert.setmapfile (armapfile);321 // simplemapoutconvertclass aroutconvert;322 // aroutconvert.setmapfile (armapfile);323 // recpt.add_converter ("a", &arinconvert, &aroutconvert);324 325 326 197 // the list of actions. Note: these actions will become invalid 327 198 // at the end of this function. -
trunk/gsdl/src/recpt/pageaction.cpp
r1861 r1870 353 353 } 354 354 355 if (pref_langs.find("zh") == pref_langs.end())356 disp.setmacro ("encodingoption", "preferences", "");357 358 355 } else { 359 356 while (tlang != elang) { … … 373 370 // create the "encoding" selection box for the preferences page 374 371 text_t &arg_w = args["w"]; 375 // put encodings in another map to sort them by longname 376 text_tmap encodings; 377 encodinginfo_tmap::const_iterator thisenc = configinfo.encodings.begin(); 378 encodinginfo_tmap::const_iterator endenc = configinfo.encodings.end(); 372 text_t encodingoption; 373 text_tmap::const_iterator thisenc = configinfo.encodings.begin(); 374 text_tmap::const_iterator endenc = configinfo.encodings.end(); 379 375 while (thisenc != endenc) { 380 encodings[(*thisenc).second.longname] = (*thisenc).first; 381 thisenc++; 382 } 383 text_tmap::iterator tenc = encodings.begin(); 384 text_tmap::iterator eenc = encodings.end(); 385 386 text_t encodingoption; 387 while (tenc != eenc) { 388 encodingoption += "<option value=\"" + (*tenc).second + "\""; 389 if ((*tenc).second == arg_w) encodingoption += " selected"; 390 encodingoption += ">" + (*tenc).first + "\n"; 391 tenc ++; 376 encodingoption += "<option value=\"" + (*thisenc).second + "\""; 377 if ((*thisenc).second == arg_w) encodingoption += " selected"; 378 encodingoption += ">" + (*thisenc).first + "\n"; 379 thisenc ++; 392 380 } 393 381 -
trunk/gsdl/src/recpt/receptionist.cpp
r1861 r1870 94 94 } 95 95 96 void encodinginfo_t::clear () {97 longname.clear();98 label.clear();99 }100 101 96 receptionist::receptionist () { 102 97 // create a list of cgi arguments … … 207 202 void receptionist::configure (const text_t &key, const text_tarray &cfgline) { 208 203 // configure the receptionist 204 209 205 if (cfgline.size() >= 1) { 210 206 cgiarginfo *info = NULL; … … 317 313 318 314 } else if (key == "Encoding") { 319 text_t subkey, subvalue; 320 text_t shortname, longname, label; 315 316 configure_encoding (cfgline); 317 318 } else if (key == "Language") { 319 text_t subkey, subvalue, shortname; 320 languageinfo_t lang; 321 321 text_t::const_iterator cfglinesub_here; 322 322 text_tarray::const_iterator cfgline_here = cfgline.begin(); … … 328 328 shortname = substr (cfglinesub_here, (*cfgline_here).end()); 329 329 } else if (subkey == "longname") { 330 longname = substr (cfglinesub_here, (*cfgline_here).end()); 331 } else if (subkey == "label") { 332 label = substr (cfglinesub_here, (*cfgline_here).end()); 333 } 334 cfgline_here++; 335 } 336 if (!shortname.empty() && !label.empty()) { 337 encodinginfo_t enc; 338 if (longname.empty()) enc.longname = shortname; 339 else enc.longname = longname; 340 enc.label = label; 341 configinfo.encodings[shortname] = enc; 342 } 343 344 } else if (key == "Language") { 345 text_t subkey, subvalue; 346 text_t shortname, longname, defaultencoding; 347 text_t::const_iterator cfglinesub_here; 348 text_tarray::const_iterator cfgline_here = cfgline.begin(); 349 text_tarray::const_iterator cfgline_end = cfgline.end(); 350 while (cfgline_here != cfgline_end) { 351 cfglinesub_here = getdelimitstr((*cfgline_here).begin(), 352 (*cfgline_here).end(), '=', subkey); 353 if (subkey == "shortname") { 354 shortname = substr (cfglinesub_here, (*cfgline_here).end()); 355 } else if (subkey == "longname") { 356 longname = substr (cfglinesub_here, (*cfgline_here).end()); 330 lang.longname = substr (cfglinesub_here, 
(*cfgline_here).end()); 357 331 } else if (subkey == "default_encoding") { 358 defaultencoding = substr (cfglinesub_here, (*cfgline_here).end());332 lang.defaultencoding = substr (cfglinesub_here, (*cfgline_here).end()); 359 333 } 360 334 cfgline_here++; 361 335 } 362 336 if (!shortname.empty()) { 363 languageinfo_t lang; 364 if (longname.empty()) lang.longname = shortname; 365 else lang.longname = longname; 366 lang.defaultencoding = defaultencoding; 337 if (lang.longname.empty()) lang.longname = shortname; 367 338 configinfo.languages[shortname] = lang; 368 339 } … … 415 386 416 387 417 // init should be called after all the actions, protocols, and 418 // converters have been added to the receptionist and after everything 419 // has been configured but before any pages are created. 420 // It returns true on success and false on failure. If false is 421 // returned getpage should not be called (without producing 422 // meaningless output), instead an error page should be 423 // produced by the calling code. 388 // init should be called after all the actions and protocols have been 389 // added to the receptionist and after everything has been configured but 390 // before any pages are created. It returns true on success and false on 391 // failure. If false is returned getpage should not be called (without 392 // producing meaningless output), instead an error page should be produced 393 // by the calling code. 
424 394 bool receptionist::init (ostream &logout) { 395 425 396 // first configure collectdir 426 397 text_t thecollectdir = configinfo.gsdlhome; … … 470 441 srand (time(NULL)); 471 442 472 // make the output converters remove all the zero-width spaces473 convertinfoclass::iterator converthere = converters.begin ();474 convertinfoclass::iterator convertend = converters.end ();475 while (converthere != convertend) {476 assert ((*converthere).second.outconverter != NULL);477 if ((*converthere).second.outconverter != NULL) {478 (*converthere).second.outconverter->set_rzws(1);479 }480 converthere++;481 }482 483 443 // if maintainer email address is something dodgy (for now I'll define 484 444 // dodgy as being anything that doesn't contain '@') disable EmailEvents … … 539 499 540 500 // make sure the encoding is valid 541 if (con figinfo.encodings.find(default_encoding) == configinfo.encodings.end()) return "";501 if (converters.find(default_encoding) == converters.end()) return ""; 542 502 543 503 return default_encoding; … … 828 788 // add the encoding information 829 789 if (response == content) { 830 if (con figinfo.encodings.find(args["w"]) != configinfo.encodings.end()) {831 response_data += "; charset=" + configinfo.encodings[args["w"]].label;790 if (converters.find(args["w"]) != converters.end()) { 791 response_data += "; charset=" + args["w"]; 832 792 } else { 833 793 // default to latin 1 … … 1335 1295 } 1336 1296 1337 1297 // Handles an "Encoding" line from a configuration file - note that the 1298 // configinfo.encodings map is a bit of a hack (to be fixed when the 1299 // configuration files are tidied up). 
1300 void receptionist::configure_encoding (const text_tarray &cfgline) { 1301 1302 text_t subkey, subvalue, shortname, longname, mapfile; 1303 text_t::const_iterator cfglinesub_here; 1304 text_tarray::const_iterator cfgline_here = cfgline.begin(); 1305 text_tarray::const_iterator cfgline_end = cfgline.end(); 1306 while (cfgline_here != cfgline_end) { 1307 cfglinesub_here = getdelimitstr((*cfgline_here).begin(), 1308 (*cfgline_here).end(), '=', subkey); 1309 if (subkey == "shortname") { 1310 shortname = substr (cfglinesub_here, (*cfgline_here).end()); 1311 } else if (subkey == "longname") { 1312 longname = substr (cfglinesub_here, (*cfgline_here).end()); 1313 } else if (subkey == "map") { 1314 mapfile = substr (cfglinesub_here, (*cfgline_here).end()); 1315 } 1316 cfgline_here++; 1317 } 1318 if (!shortname.empty()) { 1319 if (longname.empty()) longname = shortname; 1320 1321 // add the converter 1322 if (shortname == "utf-8") { 1323 utf8inconvertclass *utf8inconvert = new utf8inconvertclass(); 1324 utf8outconvertclass *utf8outconvert = new utf8outconvertclass(); 1325 utf8outconvert->set_rzws(1); 1326 add_converter (shortname, utf8inconvert, utf8outconvert); 1327 configinfo.encodings[longname] = shortname; 1328 1329 } else if (!mapfile.empty()) { 1330 1331 if (mapfile == "8859_1.ump") { 1332 // iso-8859-1 is a special case as it'll always be supported by the 1333 // standard converter class and therefore doesn't need to use its 1334 // mapping file 1335 inconvertclass *inconvert = new inconvertclass(); 1336 rzwsoutconvertclass *outconvert = new rzwsoutconvertclass(); 1337 outconvert->set_rzws(1); 1338 add_converter (shortname, inconvert, outconvert); 1339 configinfo.encodings[longname] = shortname; 1340 1341 } else { 1342 text_t to_uc_map = filename_cat(configinfo.gsdlhome, "mappings", "to_uc", mapfile); 1343 text_t from_uc_map = filename_cat(configinfo.gsdlhome, "mappings", "from_uc", mapfile); 1344 if (file_exists(to_uc_map) && file_exists(from_uc_map)) { 1345 1346 
mapinconvertclass *mapinconvert = new mapinconvertclass(); 1347 mapinconvert->setmapfile (to_uc_map, 0x003F); 1348 mapoutconvertclass *mapoutconvert = new mapoutconvertclass(); 1349 mapoutconvert->setmapfile (from_uc_map, 0x3F); 1350 mapoutconvert->set_rzws(1); 1351 add_converter (shortname, mapinconvert, mapoutconvert); 1352 configinfo.encodings[longname] = shortname; 1353 } 1354 } 1355 } 1356 } 1357 } -
trunk/gsdl/src/recpt/receptionist.h
r1860 r1870 69 69 }; 70 70 71 struct encodinginfo_t {72 void clear();73 encodinginfo_t () {clear();}74 75 text_t longname;76 text_t label;77 };78 79 71 typedef map<text_t, collectioninfo_t, lttext_t> colinfo_tmap; 80 72 typedef map<text_t, languageinfo_t, lttext_t> languageinfo_tmap; 81 typedef map<text_t, encodinginfo_t, lttext_t> encodinginfo_tmap;82 73 83 74 enum events_t {Disabled, CollectorEvents, AllEvents}; … … 111 102 112 103 languageinfo_tmap languages; 113 encodinginfo_tmap encodings; 104 105 // encodings is just a simple mapping from encoding longnames to 106 // shortnames. It's useful for now for creating the pulldown menu of 107 // encodings on the preferences page but isn't intended to be permanent. 108 text_tmap encodings; 114 109 115 110 void clear (); … … 150 145 bool append_logstr (const text_t &filename, const text_t &logstr, 151 146 ostream &logout); 147 148 void configure_encoding (const text_tarray &cfgline); 152 149 153 150 public:
Note:
See TracChangeset
for help on using the changeset viewer.