# XSD4R - Charset handling library.
# Copyright (C) 2001, 2003, 2005 NAKAMURA, Hiroshi <[email protected]>.

# This program is copyrighted free software by NAKAMURA, Hiroshi. You can
# redistribute it and/or modify it under the same terms of Ruby's license;
# either the dual license version in 2003, or any later version.


module XSD


# Charset helpers for XSD4R.
#
# Encodings are named with the legacy $KCODE vocabulary ('NONE', 'EUC',
# 'SJIS', 'UTF8') plus the private names 'X_ISO_8859_1' and 'X_UNKNOWN'.
# The module keeps one "internal" encoding, a table of converters between
# encoding pairs, a table that maps encoding names to XML charset labels,
# and byte-level regexps that test whether a string is well-formed in a
# given encoding.
module Charset
  # $KCODE is nil on Ruby 1.9 and later; fall back to 'NONE' (the Ruby 1.8
  # default) so that the internal encoding is always a String and
  # xml_encoding_label does not fail on nil.
  @internal_encoding = $KCODE || 'NONE'

  class XSDError < StandardError; end
  class CharsetError < XSDError; end
  class UnknownCharsetError < CharsetError; end
  class CharsetConversionError < CharsetError; end

public

  ###
  ## Maps
  #

  # Converter table: [from_encoding, to_encoding] => object responding to
  # call(str). Left mutable on purpose so converters can be registered.
  EncodingConvertMap = {}

  # Fills EncodingConvertMap with every converter available on this
  # installation. The ISO-8859-1 <-> UTF-8 converters are pure Ruby and are
  # always registered. The Japanese converters come from xsd/iconvcharset
  # when it loads; otherwise from nkf and uconv, each of which is optional.
  # Loading xsd/iconvcharset or uconv switches the internal encoding to
  # 'UTF8'.
  def Charset.init
    utf8_to_latin1 = lambda { |str| str.unpack('U*').pack('C*') }
    latin1_to_utf8 = lambda { |str| str.unpack('C*').pack('U*') }
    # 'X_ISO_8859_1' is the name CharsetMap/charset_str use, so it is the
    # key encoding_to_xml/encoding_from_xml look up. 'X_ISO8859_1' is the
    # spelling that was registered historically; it is kept as an alias so
    # existing direct callers of encoding_conv keep working.
    ['X_ISO_8859_1', 'X_ISO8859_1'].each do |latin1|
      EncodingConvertMap[['UTF8', latin1]] = utf8_to_latin1
      EncodingConvertMap[[latin1, 'UTF8']] = latin1_to_utf8
    end

    begin
      require 'xsd/iconvcharset'
      @internal_encoding = 'UTF8'
      # Windows-like platforms use the cp932 variant of Shift_JIS.
      sjtag = (/(mswin|bccwin|mingw|cygwin|emx)/ =~ RUBY_PLATFORM) ? 'cp932' :
        'shift_jis'
      # NOTE(review): judging by the map keys, safe_iconv takes
      # (target, source, string) -- confirm against xsd/iconvcharset.
      EncodingConvertMap[['UTF8', 'EUC' ]] =
        lambda { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) }
      EncodingConvertMap[['EUC' , 'UTF8']] =
        lambda { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) }
      EncodingConvertMap[['EUC' , 'SJIS']] =
        lambda { |str| IconvCharset.safe_iconv(sjtag, "euc-jp", str) }
      EncodingConvertMap[['UTF8', 'SJIS']] =
        lambda { |str| IconvCharset.safe_iconv(sjtag, "utf-8", str) }
      EncodingConvertMap[['SJIS', 'UTF8']] =
        lambda { |str| IconvCharset.safe_iconv("utf-8", sjtag, str) }
      EncodingConvertMap[['SJIS', 'EUC' ]] =
        lambda { |str| IconvCharset.safe_iconv("euc-jp", sjtag, str) }
    rescue LoadError
      # No iconv support: try the two older libraries independently.
      begin
        require 'nkf'
        EncodingConvertMap[['EUC' , 'SJIS']] =
          lambda { |str| NKF.nkf('-sXm0', str) }
        EncodingConvertMap[['SJIS', 'EUC' ]] =
          lambda { |str| NKF.nkf('-eXm0', str) }
      rescue LoadError
        # nkf is optional.
      end

      begin
        require 'uconv'
        @internal_encoding = 'UTF8'
        EncodingConvertMap[['UTF8', 'EUC' ]] = Uconv.method(:u8toeuc)
        EncodingConvertMap[['UTF8', 'SJIS']] = Uconv.method(:u8tosjis)
        EncodingConvertMap[['EUC' , 'UTF8']] = Uconv.method(:euctou8)
        EncodingConvertMap[['SJIS', 'UTF8']] = Uconv.method(:sjistou8)
      rescue LoadError
        # uconv is optional.
      end
    end
  end
  self.init

  # Encoding name => XML charset label (nil when there is no label).
  CharsetMap = {
    'NONE' => 'us-ascii',
    'EUC' => 'euc-jp',
    'SJIS' => 'shift_jis',
    'UTF8' => 'utf-8',
    'X_ISO_8859_1' => 'iso-8859-1',
    'X_UNKNOWN' => nil,
  }


  ###
  ## handlers
  #

  # Returns the current internal encoding name.
  def Charset.encoding
    @internal_encoding
  end

  # Sets the internal encoding name; logs the change when $DEBUG is set.
  def Charset.encoding=(encoding)
    warn("xsd charset is set to #{encoding}") if $DEBUG
    @internal_encoding = encoding
  end

  # Returns the XML charset label of the internal encoding.
  def Charset.xml_encoding_label
    charset_label(@internal_encoding)
  end

  # Converts str from the internal encoding to the encoding identified by
  # the XML charset label +charset+.
  # Raises CharsetConversionError when no converter is registered.
  def Charset.encoding_to_xml(str, charset)
    encoding_conv(str, @internal_encoding, charset_str(charset))
  end

  # Converts str from the encoding identified by the XML charset label
  # +charset+ to the internal encoding.
  # Raises CharsetConversionError when no converter is registered.
  def Charset.encoding_from_xml(str, charset)
    encoding_conv(str, charset_str(charset), @internal_encoding)
  end

  # Converts str from enc_from to enc_to (both encoding names).
  # str is returned untouched when the two names are equal or when either
  # one is 'NONE'. Otherwise the converter registered in EncodingConvertMap
  # is called. Raises CharsetConversionError when there is none.
  def Charset.encoding_conv(str, enc_from, enc_to)
    if enc_from == enc_to || enc_from == 'NONE' || enc_to == 'NONE'
      str
    elsif (converter = EncodingConvertMap[[enc_from, enc_to]])
      converter.call(str)
    else
      raise CharsetConversionError,
        "Converter not found: #{enc_from} -> #{enc_to}"
    end
  end

  # Maps an encoding name (case-insensitive) to its XML charset label.
  # Returns nil for names that are not in CharsetMap.
  def Charset.charset_label(encoding)
    CharsetMap[encoding.upcase]
  end

  # Maps an XML charset label (case-insensitive) back to an encoding name.
  # Returns 'X_UNKNOWN' for labels that are not in CharsetMap.
  def Charset.charset_str(label)
    # Hash#key replaced Hash#index; support whichever this Ruby provides.
    if CharsetMap.respond_to?(:key)
      CharsetMap.key(label.downcase) || 'X_UNKNOWN'
    else
      CharsetMap.index(label.downcase) || 'X_UNKNOWN'
    end
  end

  # Builds a byte-oriented regexp from a pattern source. The three-argument
  # Regexp.new(src, nil, "NONE") form is no longer accepted by Ruby 3.3, so
  # Regexp::NOENCODING is used wherever that constant exists and the legacy
  # form is kept only for interpreters that predate it.
  binary_regexp = lambda do |source|
    if defined?(Regexp::NOENCODING)
      Regexp.new(source, Regexp::NOENCODING)
    else
      Regexp.new(source, nil, 'NONE')
    end
  end

  # us_ascii = '[\x00-\x7F]'
  us_ascii = '[\x9\xa\xd\x20-\x7F]'	# XML 1.0 restricted.
  USASCIIRegexp = binary_regexp.call("\\A#{us_ascii}*\\z")

  twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'
  threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'
  character_euc = "(?:#{us_ascii}|#{twobytes_euc}|#{threebytes_euc})"
  EUCRegexp = binary_regexp.call("\\A#{character_euc}*\\z")

  # onebyte_sjis = '[\x00-\x7F\xA1-\xDF]'
  onebyte_sjis = '[\x9\xa\xd\x20-\x7F\xA1-\xDF]'	# XML 1.0 restricted.
  twobytes_sjis = '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])'
  character_sjis = "(?:#{onebyte_sjis}|#{twobytes_sjis})"
  SJISRegexp = binary_regexp.call("\\A#{character_sjis}*\\z")

  # 0xxxxxxx
  # 110yyyyy 10xxxxxx
  twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])'
  # 1110zzzz 10yyyyyy 10xxxxxx
  threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])'
  # 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx
  fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
  character_utf8 =
    "(?:#{us_ascii}|#{twobytes_utf8}|#{threebytes_utf8}|#{fourbytes_utf8})"
  UTF8Regexp = binary_regexp.call("\\A#{character_utf8}*\\z")

  # Returns a binary-tagged copy of str where String#force_encoding exists,
  # otherwise str itself. The regexps above describe raw bytes; matching
  # them against a non-ASCII string tagged with another encoding raises
  # instead of answering, so the predicates look at the bytes only.
  # Objects that are not encoding-aware (including nil) pass through.
  def Charset.binary_view(str)
    str.respond_to?(:force_encoding) ? str.dup.force_encoding('BINARY') : str
  end
  private_class_method :binary_view

  # Each predicate returns 0 when the whole string is valid in the named
  # encoding (restricted to the control characters XML 1.0 allows) and nil
  # otherwise.
  def Charset.is_us_ascii(str)
    USASCIIRegexp =~ binary_view(str)
  end

  def Charset.is_utf8(str)
    UTF8Regexp =~ binary_view(str)
  end

  def Charset.is_euc(str)
    EUCRegexp =~ binary_view(str)
  end

  def Charset.is_sjis(str)
    SJISRegexp =~ binary_view(str)
  end

  # Checks str against the encoding named by +code+ ('NONE', 'UTF8', 'EUC'
  # or 'SJIS'; defaults to $KCODE). Returns what the matching predicate
  # returns. Raises UnknownCharsetError for any other code.
  def Charset.is_ces(str, code = $KCODE)
    case code
    when 'NONE'
      is_us_ascii(str)
    when 'UTF8'
      is_utf8(str)
    when 'EUC'
      is_euc(str)
    when 'SJIS'
      is_sjis(str)
    else
      raise UnknownCharsetError, "Unknown charset: #{code}"
    end
  end
end


end