#
# kconv.rb - Kanji Converter.
#
# $Id: kconv.rb 11708 2007-02-12 23:01:19Z shyouhei $
#
# ----
#
# kconv.rb implements the Kconv module for Kanji conversion. Additionally,
# some methods are added to the String class to allow easy conversion.
#

require 'nkf'

#
# Kanji Converter for Ruby.
#
module Kconv
  #
  # Public Constants
  #

  # Constants of Encoding (thin aliases of the NKF constants).

  # Auto-Detect
  AUTO = NKF::AUTO
  # ISO-2022-JP
  JIS = NKF::JIS
  # EUC-JP
  EUC = NKF::EUC
  # Shift_JIS
  SJIS = NKF::SJIS
  # BINARY
  BINARY = NKF::BINARY
  # NOCONV
  NOCONV = NKF::NOCONV
  # ASCII
  ASCII = NKF::ASCII
  # UTF-8
  UTF8 = NKF::UTF8
  # UTF-16
  UTF16 = NKF::UTF16
  # UTF-32
  UTF32 = NKF::UTF32
  # UNKNOWN
  UNKNOWN = NKF::UNKNOWN

  #
  # Private Constants
  #

  # Revision of kconv.rb
  REVISION = %q$Revision: 11708 $

  # Regexps of Encoding.  All of them are byte-oriented (/n) and anchored
  # with \A ... \z, so they only match when the WHOLE string is well formed.

  # Regexp of Shift_JIS string (private constant)
  RegexpShiftjis = /\A(?:
                       [\x00-\x7f\xa1-\xdf] |
                       [\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]
                   )*\z/nx

  # Regexp of EUC-JP string (private constant)
  RegexpEucjp = /\A(?:
                    [\x00-\x7f]                         |
                    \x8e        [\xa1-\xdf]             |
                    \x8f        [\xa1-\xfe] [\xa1-\xfe] |
                    [\xa1-\xfe] [\xa1-\xfe]
                )*\z/nx

  # Regexp of UTF-8 string (private constant)
  #
  # The lead byte \xed is handled on its own line: its second byte is
  # limited to \x80-\x9f so that the UTF-16 surrogate range
  # (U+D800..U+DFFF, i.e. \xed \xa0-\xbf ..) is rejected, as required for
  # well-formed UTF-8.
  RegexpUtf8 = /\A(?:
                   [\x00-\x7f]                                            |
                   [\xc2-\xdf]         [\x80-\xbf]                         |
                   \xe0                [\xa0-\xbf] [\x80-\xbf]             |
                   [\xe1-\xec\xee\xef] [\x80-\xbf] [\x80-\xbf]             |
                   \xed                [\x80-\x9f] [\x80-\xbf]             |
                   \xf0                [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
                   [\xf1-\xf3]         [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
                   \xf4                [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
               )*\z/nx

  #
  # Public Methods
  #

  # call-seq:
  #    Kconv.kconv(str, out_code, in_code = Kconv::AUTO)
  #
  # Converts <code>str</code> to <code>out_code</code>.
  # <code>out_code</code> and <code>in_code</code> are given as constants
  # of Kconv.
  #
  # When <code>out_code</code> is Kconv::NOCONV, <code>str</code> itself is
  # returned without calling NKF.  Codes that have no corresponding nkf
  # option (e.g. Kconv::AUTO as <code>in_code</code>) add nothing to the
  # option string, leaving the choice to nkf.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf directly.
  def kconv(str, out_code, in_code = AUTO)
    opt = '-'

    # Input side: upper-case nkf flags.
    case in_code
    when ::NKF::JIS
      opt << 'J'
    when ::NKF::EUC
      opt << 'E'
    when ::NKF::SJIS
      opt << 'S'
    when ::NKF::UTF8
      opt << 'W'
    when ::NKF::UTF16
      opt << 'W16'
    when ::NKF::UTF32
      opt << 'W32'
    end

    # Output side: lower-case nkf flags.
    case out_code
    when ::NKF::JIS
      opt << 'j'
    when ::NKF::EUC
      opt << 'e'
    when ::NKF::SJIS
      opt << 's'
    when ::NKF::UTF8
      opt << 'w'
    when ::NKF::UTF16
      opt << 'w16'
    when ::NKF::UTF32
      opt << 'w32'
    when ::NKF::NOCONV
      return str
    end

    # A lone '-' is not a meaningful option string; pass no options instead.
    opt = '' if opt == '-'

    ::NKF::nkf(opt, str)
  end
  module_function :kconv

  #
  # Encode to
  #

  # call-seq:
  #    Kconv.tojis(str)   -> string
  #
  # Converts <code>str</code> to ISO-2022-JP.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-jxm0', str).
  def tojis(str)
    ::NKF::nkf('-jm', str)
  end
  module_function :tojis

  # call-seq:
  #    Kconv.toeuc(str)   -> string
  #
  # Converts <code>str</code> to EUC-JP.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-exm0', str).
  def toeuc(str)
    ::NKF::nkf('-em', str)
  end
  module_function :toeuc

  # call-seq:
  #    Kconv.tosjis(str)   -> string
  #
  # Converts <code>str</code> to Shift_JIS.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-sxm0', str).
  def tosjis(str)
    ::NKF::nkf('-sm', str)
  end
  module_function :tosjis

  # call-seq:
  #    Kconv.toutf8(str)   -> string
  #
  # Converts <code>str</code> to UTF-8.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-wxm0', str).
  def toutf8(str)
    ::NKF::nkf('-wm', str)
  end
  module_function :toutf8

  # call-seq:
  #    Kconv.toutf16(str)   -> string
  #
  # Converts <code>str</code> to UTF-16.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-w16xm0', str).
  def toutf16(str)
    ::NKF::nkf('-w16m', str)
  end
  module_function :toutf16

  #
  # guess
  #

  # call-seq:
  #    Kconv.guess(str)   -> integer
  #
  # Guesses the input encoding by delegating to NKF.guess.
  def guess(str)
    ::NKF::guess(str)
  end
  module_function :guess

  # call-seq:
  #    Kconv.guess_old(str)   -> integer
  #
  # Guesses the input encoding by delegating to NKF.guess1.
  def guess_old(str)
    ::NKF::guess1(str)
  end
  module_function :guess_old

  #
  # isEncoding
  #

  # call-seq:
  #    Kconv.iseuc(str)   -> obj or nil
  #
  # Returns whether the whole of <code>str</code> is a well-formed EUC-JP
  # byte sequence (nil when it is not).
  #
  # *Note* do not rely on the return value being a MatchData; use it only
  # as a truth value.
  def iseuc(str)
    RegexpEucjp.match( str )
  end
  module_function :iseuc

  # call-seq:
  #    Kconv.issjis(str)   -> obj or nil
  #
  # Returns whether the whole of <code>str</code> is a well-formed
  # Shift_JIS byte sequence (nil when it is not).
  #
  # *Note* do not rely on the return value being a MatchData; use it only
  # as a truth value.
  def issjis(str)
    RegexpShiftjis.match( str )
  end
  module_function :issjis

  # call-seq:
  #    Kconv.isutf8(str)   -> obj or nil
  #
  # Returns whether the whole of <code>str</code> is a well-formed UTF-8
  # byte sequence (nil when it is not).  Encoded surrogate code points are
  # rejected.
  #
  # *Note* do not rely on the return value being a MatchData; use it only
  # as a truth value.
  def isutf8(str)
    RegexpUtf8.match( str )
  end
  module_function :isutf8

end

class String
  # call-seq:
  #    String#kconv(out_code, in_code = Kconv::AUTO)
  #
  # Converts the receiver to <code>out_code</code> by delegating to
  # Kconv.kconv.  Both <code>out_code</code> and <code>in_code</code> are
  # given as constants of Kconv.
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf.
  def kconv(out_code, in_code = Kconv::AUTO)
    Kconv.kconv(self, out_code, in_code)
  end

  #
  # to Encoding
  #

  # call-seq:
  #    String#tojis   -> string
  #
  # Converts the receiver to ISO-2022-JP (delegates to Kconv.tojis).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-jxm0', str).
  def tojis
    Kconv.tojis(self)
  end

  # call-seq:
  #    String#toeuc   -> string
  #
  # Converts the receiver to EUC-JP (delegates to Kconv.toeuc).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-exm0', str).
  def toeuc
    Kconv.toeuc(self)
  end

  # call-seq:
  #    String#tosjis   -> string
  #
  # Converts the receiver to Shift_JIS (delegates to Kconv.tosjis).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-sxm0', str).
  def tosjis
    Kconv.tosjis(self)
  end

  # call-seq:
  #    String#toutf8   -> string
  #
  # Converts the receiver to UTF-8 (delegates to Kconv.toutf8).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-wxm0', str).
  def toutf8
    Kconv.toutf8(self)
  end

  # call-seq:
  #    String#toutf16   -> string
  #
  # Converts the receiver to UTF-16 (delegates to Kconv.toutf16).
  #
  # *Note*
  # This method decodes MIME encoded strings and
  # converts halfwidth katakana to fullwidth katakana.
  # If you don't want that, use NKF.nkf('-w16xm0', str).
  def toutf16
    Kconv.toutf16(self)
  end

  #
  # is Encoding
  #

  # call-seq:
  #    String#iseuc   -> obj or nil
  #
  # Returns whether the receiver's encoding is EUC-JP or not
  # (delegates to Kconv.iseuc).
  #
  # *Note* do not rely on the return value being a MatchData; use it only
  # as a truth value.
  def iseuc
    Kconv.iseuc(self)
  end

  # call-seq:
  #    String#issjis   -> obj or nil
  #
  # Returns whether the receiver's encoding is Shift_JIS or not
  # (delegates to Kconv.issjis).
  #
  # *Note* do not rely on the return value being a MatchData; use it only
  # as a truth value.
  def issjis
    Kconv.issjis(self)
  end

  # call-seq:
  #    String#isutf8   -> obj or nil
  #
  # Returns whether the receiver's encoding is UTF-8 or not
  # (delegates to Kconv.isutf8).
  #
  # *Note* do not rely on the return value being a MatchData; use it only
  # as a truth value.
  def isutf8
    Kconv.isutf8(self)
  end
end