# Source: extensions/gsdl-video/trunk/installed/cmdline/lib/ruby/1.8/xsd/charset.rb @ 18425
#
# Last change on this file since 18425 was 18425, checked in by davidb, 15 years ago
#
# Video extension to Greenstone
#
# File size: 5.2 KB
# XSD4R - Charset handling library.
# Copyright (C) 2001, 2003, 2005 NAKAMURA, Hiroshi <[email protected]>.

# This program is copyrighted free software by NAKAMURA, Hiroshi. You can
# redistribute it and/or modify it under the same terms of Ruby's license;
# either the dual license version in 2003, or any later version.


# Namespace of the XSD4R library.
module XSD

  # Charset handling for XSD4R.
  #
  # The module keeps one "internal encoding" and converts strings between it
  # and the encoding named by an XML document.  Encodings are named in two
  # vocabularies:
  #
  # * $KCODE-style names ('NONE', 'EUC', 'SJIS', 'UTF8', 'X_ISO_8859_1',
  #   'X_UNKNOWN'); these are the keys of CharsetMap and the elements of the
  #   [from, to] keys of EncodingConvertMap.
  # * XML charset labels ('us-ascii', 'euc-jp', ...); these are the values of
  #   CharsetMap.
  module Charset
    # Initial internal encoding is whatever $KCODE holds; Charset.init
    # replaces it with 'UTF8' when iconv or uconv can be loaded.
    @internal_encoding = $KCODE

    # Base class of every error raised by this module.
    class XSDError < StandardError; end
    # Base class of charset related errors.
    class CharsetError < XSDError; end
    # Raised by Charset.is_ces for an encoding name it does not know.
    class UnknownCharsetError < CharsetError; end
    # Raised by Charset.encoding_conv when no converter is registered.
    class CharsetConversionError < CharsetError; end

  public

    ###
    ## Maps
    #

    # Converter registry: [from, to] (both $KCODE-style names) => an object
    # responding to #call(str) that returns the converted string.
    EncodingConvertMap = {}

    # Fills EncodingConvertMap with every converter available in this
    # interpreter.
    #
    # The UTF8 <-> ISO-8859-1 converters are pure Ruby and always present.
    # Japanese converters come from 'xsd/iconvcharset' when it loads, and
    # otherwise from whichever of 'nkf' and 'uconv' load.  A missing library
    # is not an error: the corresponding converters are simply not
    # registered.  Loading iconv or uconv also switches the internal encoding
    # to 'UTF8'.
    def Charset.init
      utf8_to_latin1 = Proc.new { |str| str.unpack('U*').pack('C*') }
      latin1_to_utf8 = Proc.new { |str| str.unpack('C*').pack('U*') }
      # 'X_ISO_8859_1' is the name CharsetMap (and therefore charset_str)
      # uses, so it is the key encoding_to_xml/encoding_from_xml look up.
      # 'X_ISO8859_1' is the spelling this method registered historically;
      # it is kept so that existing direct lookups keep working.
      ['X_ISO_8859_1', 'X_ISO8859_1'].each do |latin1|
        EncodingConvertMap[['UTF8', latin1]] = utf8_to_latin1
        EncodingConvertMap[[latin1, 'UTF8']] = latin1_to_utf8
      end

      begin
        require 'xsd/iconvcharset'
        @internal_encoding = 'UTF8'
        # Windows-like platforms get 'cp932' as the Shift_JIS charset name.
        sjtag = (/(mswin|bccwin|mingw|cygwin|emx)/ =~ RUBY_PLATFORM) ? 'cp932' :
          'shift_jis'
        # safe_iconv is called with the target charset first, then the
        # source charset, then the string.
        EncodingConvertMap[['UTF8', 'EUC' ]] =
          Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) }
        EncodingConvertMap[['EUC' , 'UTF8']] =
          Proc.new { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) }
        EncodingConvertMap[['EUC' , 'SJIS']] =
          Proc.new { |str| IconvCharset.safe_iconv(sjtag, "euc-jp", str) }
        EncodingConvertMap[['UTF8', 'SJIS']] =
          Proc.new { |str| IconvCharset.safe_iconv(sjtag, "utf-8", str) }
        EncodingConvertMap[['SJIS', 'UTF8']] =
          Proc.new { |str| IconvCharset.safe_iconv("utf-8", sjtag, str) }
        EncodingConvertMap[['SJIS', 'EUC' ]] =
          Proc.new { |str| IconvCharset.safe_iconv("euc-jp", sjtag, str) }
      rescue LoadError
        # No iconv wrapper: fall back to nkf (EUC <-> SJIS) and uconv
        # (UTF8 <-> EUC/SJIS), each of which is optional as well.
        begin
          require 'nkf'
          EncodingConvertMap[['EUC' , 'SJIS']] =
            Proc.new { |str| NKF.nkf('-sXm0', str) }
          EncodingConvertMap[['SJIS', 'EUC' ]] =
            Proc.new { |str| NKF.nkf('-eXm0', str) }
        rescue LoadError
          # nkf is not installed; EUC <-> SJIS stays unavailable.
        end

        begin
          require 'uconv'
          @internal_encoding = 'UTF8'
          EncodingConvertMap[['UTF8', 'EUC' ]] = Uconv.method(:u8toeuc)
          EncodingConvertMap[['UTF8', 'SJIS']] = Uconv.method(:u8tosjis)
          EncodingConvertMap[['EUC' , 'UTF8']] = Uconv.method(:euctou8)
          EncodingConvertMap[['SJIS', 'UTF8']] = Uconv.method(:sjistou8)
        rescue LoadError
          # uconv is not installed; UTF8 <-> EUC/SJIS stays unavailable.
        end
      end
    end
    self.init

    # $KCODE-style encoding name => XML charset label.
    CharsetMap = {
      'NONE' => 'us-ascii',
      'EUC' => 'euc-jp',
      'SJIS' => 'shift_jis',
      'UTF8' => 'utf-8',
      'X_ISO_8859_1' => 'iso-8859-1',
      'X_UNKNOWN' => nil,
    }


    ###
    ## handlers
    #

    # Returns the current internal encoding ($KCODE-style name).
    def Charset.encoding
      @internal_encoding
    end

    # Sets the internal encoding; the change is reported through warn when
    # $DEBUG is set.
    def Charset.encoding=(encoding)
      warn("xsd charset is set to #{encoding}") if $DEBUG
      @internal_encoding = encoding
    end

    # Returns the XML charset label of the internal encoding.
    def Charset.xml_encoding_label
      charset_label(@internal_encoding)
    end

    # Converts +str+ from the internal encoding to the encoding named by the
    # XML charset label +charset+.  Raises CharsetConversionError when no
    # converter is registered for that pair.
    def Charset.encoding_to_xml(str, charset)
      encoding_conv(str, @internal_encoding, charset_str(charset))
    end

    # Converts +str+ from the encoding named by the XML charset label
    # +charset+ to the internal encoding.  Raises CharsetConversionError when
    # no converter is registered for that pair.
    def Charset.encoding_from_xml(str, charset)
      encoding_conv(str, charset_str(charset), @internal_encoding)
    end

    # Converts +str+ from +enc_from+ to +enc_to+ (both $KCODE-style names).
    #
    # +str+ is returned as is when both names are equal or when either one
    # is 'NONE'.  Otherwise the converter registered in EncodingConvertMap is
    # applied; CharsetConversionError is raised when there is none.
    def Charset.encoding_conv(str, enc_from, enc_to)
      if enc_from == enc_to || enc_from == 'NONE' || enc_to == 'NONE'
        return str
      end

      converter = EncodingConvertMap[[enc_from, enc_to]]
      unless converter
        raise CharsetConversionError.new(
          "Converter not found: #{enc_from} -> #{enc_to}")
      end
      converter.call(str)
    end

    # Maps a $KCODE-style encoding name (any letter case) to its XML charset
    # label; returns nil for names missing from CharsetMap.
    def Charset.charset_label(encoding)
      CharsetMap[encoding.upcase]
    end

    # Maps an XML charset label (any letter case) to its $KCODE-style name;
    # returns 'X_UNKNOWN' for labels missing from CharsetMap.
    def Charset.charset_str(label)
      # Hash#key is used where it exists; older interpreters only offer
      # Hash#index for the reverse lookup.
      if CharsetMap.respond_to?(:key)
        CharsetMap.key(label.downcase) || 'X_UNKNOWN'
      else
        CharsetMap.index(label.downcase) || 'X_UNKNOWN'
      end
    end

    # Compiles +source+ as a byte-oriented ("n" kcode) regexp.  Interpreters
    # that define Regexp::NOENCODING get it as the option flag, because the
    # three-argument form of Regexp.new is rejected by recent Ruby versions;
    # older interpreters get the original three-argument call.
    binary_regexp = Proc.new { |source|
      if defined?(Regexp::NOENCODING)
        Regexp.new(source, Regexp::NOENCODING)
      else
        Regexp.new(source, nil, "NONE")
      end
    }

    # us_ascii = '[\x00-\x7F]'
    us_ascii = '[\x9\xa\xd\x20-\x7F]'	# XML 1.0 restricted.
    USASCIIRegexp = binary_regexp.call("\\A#{us_ascii}*\\z")

    twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'
    threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'
    character_euc = "(?:#{us_ascii}|#{twobytes_euc}|#{threebytes_euc})"
    EUCRegexp = binary_regexp.call("\\A#{character_euc}*\\z")

    # onebyte_sjis = '[\x00-\x7F\xA1-\xDF]'
    onebyte_sjis = '[\x9\xa\xd\x20-\x7F\xA1-\xDF]'	# XML 1.0 restricted.
    twobytes_sjis = '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])'
    character_sjis = "(?:#{onebyte_sjis}|#{twobytes_sjis})"
    SJISRegexp = binary_regexp.call("\\A#{character_sjis}*\\z")

    # 0xxxxxxx
    # 110yyyyy 10xxxxxx
    twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])'
    # 1110zzzz 10yyyyyy 10xxxxxx
    threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])'
    # 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx
    fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
    character_utf8 =
      "(?:#{us_ascii}|#{twobytes_utf8}|#{threebytes_utf8}|#{fourbytes_utf8})"
    UTF8Regexp = binary_regexp.call("\\A#{character_utf8}*\\z")

    # Each predicate below returns the result of matching the whole string
    # against the corresponding regexp: 0 on a match, nil otherwise.

    # Tests whether +str+ consists only of the XML 1.0 restricted US-ASCII
    # bytes accepted by USASCIIRegexp.
    def Charset.is_us_ascii(str)
      USASCIIRegexp =~ str
    end

    # Tests whether +str+ matches the UTF-8 byte patterns of UTF8Regexp.
    def Charset.is_utf8(str)
      UTF8Regexp =~ str
    end

    # Tests whether +str+ matches the EUC byte patterns of EUCRegexp.
    def Charset.is_euc(str)
      EUCRegexp =~ str
    end

    # Tests whether +str+ matches the Shift_JIS byte patterns of SJISRegexp.
    def Charset.is_sjis(str)
      SJISRegexp =~ str
    end

    # Tests +str+ against the encoding named by +code+ ($KCODE-style name,
    # defaulting to $KCODE).  Raises UnknownCharsetError for any name other
    # than 'NONE', 'UTF8', 'EUC' and 'SJIS'.
    def Charset.is_ces(str, code = $KCODE)
      case code
      when 'NONE'
        is_us_ascii(str)
      when 'UTF8'
        is_utf8(str)
      when 'EUC'
        is_euc(str)
      when 'SJIS'
        is_sjis(str)
      else
        raise UnknownCharsetError.new("Unknown charset: #{code}")
      end
    end
  end


end
# Note: See TracBrowser for help on using the repository browser.