# jcode.rb - Ruby code to handle Japanese (EUC/SJIS/UTF-8) strings

# When running verbosely, point out that $KCODE is "NONE".
if $VERBOSE && $KCODE == "NONE"
  warn "Warning: $KCODE is NONE."
end

# Remember the caller's verbosity in $vsave and switch warnings off for the
# definitions that follow; $VERBOSE is restored from $vsave at the end of the
# file.
$vsave, $VERBOSE = $VERBOSE, false
# Reopens String and replaces succ, tr, delete, squeeze, tr_s and chop (plus
# their bang forms) with regexp-based versions, and adds mbchar?, end_regexp,
# jlength/jsize, jcount and each_char.  Several methods consult $KCODE to pick
# the SJIS, EUC or UTF-8 patterns defined below.
class String
  # Printed only if $VERBOSE is truthy while this class body is evaluated.
  warn "feel free for some warnings:\n" if $VERBOSE

  # Rewrites a tr-style character specification so that it can be embedded in
  # a regexp character class:
  # * a backslash followed by one of [ ] - \ is kept unchanged,
  # * a backslash followed by any other character is reduced to that character,
  # * a bare [ ] or \ gets a backslash prepended.
  def _regex_quote(str)
    str.gsub(/(\\[\[\]\-\\])|\\(.)|([\[\]\\])/) do
      $1 || $2 || '\\' + $3
    end
  end
  private :_regex_quote

  # Regexp source fragments matching one multibyte character per encoding.
  PATTERN_SJIS = '[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]'
  PATTERN_EUC = '[\xa1-\xfe][\xa1-\xfe]'
  PATTERN_UTF8 = '[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]'

  # Compiled forms of the fragments above.  Written with the /n literal flag,
  # the same way end_regexp builds its patterns.
  RE_SJIS = /#{PATTERN_SJIS}/n
  RE_EUC = /#{PATTERN_EUC}/n
  RE_UTF8 = /#{PATTERN_UTF8}/n

  # Increment tables consulted by succ!, one per encoding ('s', 'e', 'u').
  # Each maps a one-character string to the amount to add; anything not listed
  # falls back to the default of 1.
  # NOTE(review): the keys are strings, while succ! indexes the table with
  # self[-1]; on an interpreter where self[-1] is an Integer the lookup would
  # presumably always yield the default -- confirm before relying on it.
  SUCC = {}
  SUCC['s'] = Hash.new(1)
  for i in 0 .. 0x3f
    SUCC['s'][i.chr] = 0x40 - i
  end
  SUCC['s']["\x7e"] = 0x80 - 0x7e
  SUCC['s']["\xfd"] = 0x100 - 0xfd
  SUCC['s']["\xfe"] = 0x100 - 0xfe
  SUCC['s']["\xff"] = 0x100 - 0xff
  SUCC['e'] = Hash.new(1)
  for i in 0 .. 0xa0
    SUCC['e'][i.chr] = 0xa1 - i
  end
  SUCC['e']["\xfe"] = 2
  SUCC['u'] = Hash.new(1)
  for i in 0 .. 0x7f
    SUCC['u'][i.chr] = 0x80 - i
  end
  SUCC['u']["\xbf"] = 0x100 - 0xbf

  # Matches the receiver against the multibyte pattern selected by the first
  # character of $KCODE (s/S, e/E or u/U) and returns the result of =~.
  # Returns nil for any other $KCODE.
  def mbchar?
    case $KCODE[0]
    when ?s, ?S
      self =~ RE_SJIS
    when ?e, ?E
      self =~ RE_EUC
    when ?u, ?U
      self =~ RE_UTF8
    else
      nil
    end
  end

  # Returns a regexp matching one character at the end of a line: the
  # multibyte pattern selected by $KCODE, or /.$/ when $KCODE selects none of
  # the three encodings.
  def end_regexp
    case $KCODE[0]
    when ?s, ?S
      /#{PATTERN_SJIS}$/on
    when ?e, ?E
      /#{PATTERN_EUC}$/on
    when ?u, ?U
      /#{PATTERN_UTF8}$/on
    else
      /.$/on
    end
  end

  # Keep the built-in implementations reachable for the fallback paths below.
  alias original_succ! succ!
  private :original_succ!

  alias original_succ succ
  private :original_succ

  # In-place successor.  If $KCODE is not 'NONE' and the receiver matches
  # end_regexp, the last byte is advanced (carrying into the second-to-last
  # byte when the last one becomes 0) until the receiver matches end_regexp
  # again, and self is returned.  Otherwise the original succ! is used.
  # The arithmetic on self[-1] / self[-2] assumes those return integers.
  def succ!
    reg = end_regexp
    if $KCODE != 'NONE' && self =~ reg
      succ_table = SUCC[$KCODE[0,1].downcase]
      begin
        self[-1] += succ_table[self[-1]]
        self[-2] += 1 if self[-1] == 0
      end while self !~ reg
      self
    else
      original_succ!
    end
  end

  # Non-destructive form of succ!: works on a copy of the receiver.
  def succ
    str = self.dup
    str.succ! or str
  end

  private

  # Expands a tr-style specification into an array of single characters.
  # "x-y" ranges (either end may be backslash-escaped) are enumerated with
  # upto; a range whose ends differ in length is skipped; everything else
  # contributes the character itself (without a leading backslash).
  def _expand_ch str
    a = []
    str.scan(/(?:\\(.)|([^\\]))-(?:\\(.)|([^\\]))|(?:\\(.)|(.))/m) do
      from = $1 || $2
      to = $3 || $4
      one = $5 || $6
      if one
        a.push one
      elsif from.length != to.length
        next
      elsif from.length == 1
        from[0].upto(to[0]) { |c| a.push c.chr }
      else
        from.upto(to) { |c| a.push c }
      end
    end
    a
  end

  # Builds the translation hash used by tr! and tr_s!: the i-th expanded
  # character of +from+ maps to the i-th expanded character of +to+, and once
  # +to+ runs out its last character is reused.
  def expand_ch_hash from, to
    h = {}
    afrom = _expand_ch(from)
    ato = _expand_ch(to)
    afrom.each_with_index do |x,i| h[x] = ato[i] || ato[-1] end
    h
  end

  # Memoised translation hashes and compiled patterns, keyed by the
  # specification strings passed by callers.
  HashCache = {}
  TrPatternCache = {}
  DeletePatternCache = {}
  SqueezePatternCache = {}
  # tr_s! needs /(..)\1*/ whereas squeeze! needs /(..)\1+/ for the same
  # specification, so the two must not share one cache.
  TrSqueezePatternCache = {}

  public

  # In-place transliteration.  Returns nil for an empty +from+, delegates to
  # delete! for an empty +to+.  A +from+ starting with ^ replaces every
  # matching character with the last character of +to+; otherwise characters
  # are mapped through expand_ch_hash.  Returns what gsub! returns.
  def tr!(from, to)
    return nil if from == ""
    return self.delete!(from) if to == ""

    pattern = TrPatternCache[from] ||= /[#{_regex_quote(from)}]/
    if from[0] == ?^
      last = /.$/.match(to)[0]
      self.gsub!(pattern, last)
    else
      h = HashCache[from + "1-0" + to] ||= expand_ch_hash(from, to)
      self.gsub!(pattern) do |c| h[c] end
    end
  end

  # Non-destructive form of tr!.
  def tr(from, to)
    (str = self.dup).tr!(from, to) or str
  end

  # Removes, in place, every run of characters described by +del+.  Returns
  # nil for an empty +del+, otherwise what gsub! returns.
  def delete!(del)
    return nil if del == ""
    self.gsub!(DeletePatternCache[del] ||= /[#{_regex_quote(del)}]+/, '')
  end

  # Non-destructive form of delete!.
  def delete(del)
    (str = self.dup).delete!(del) or str
  end

  # Collapses, in place, runs of a repeated character into one occurrence.
  # With +del+ only the characters it describes are considered; without it any
  # character (including newline) is.  Returns nil for an empty +del+,
  # otherwise what gsub! returns.
  def squeeze!(del=nil)
    return nil if del == ""
    pattern =
      if del
        SqueezePatternCache[del] ||= /([#{_regex_quote(del)}])\1+/
      else
        /(.|\n)\1+/
      end
    self.gsub!(pattern, '\1')
  end

  # Non-destructive form of squeeze!.
  def squeeze(del=nil)
    (str = self.dup).squeeze!(del) or str
  end

  # Like tr!, but each run of one repeated matching character is replaced by a
  # single translated character.  Returns nil for an empty +from+ (as tr!
  # does) and delegates to delete! for an empty +to+.
  def tr_s!(from, to)
    return nil if from == ""
    return self.delete!(from) if to.length == 0

    pattern = TrSqueezePatternCache[from] ||= /([#{_regex_quote(from)}])\1*/
    if from[0] == ?^
      last = /.$/.match(to)[0]
      self.gsub!(pattern, last)
    else
      h = HashCache[from + "1-0" + to] ||= expand_ch_hash(from, to)
      self.gsub!(pattern) do h[$1] end
    end
  end

  # Non-destructive form of tr_s!.
  def tr_s(from, to)
    (str = self.dup).tr_s!(from,to) or str
  end

  # Removes, in place, the final character, or a final "\n" / "\r\n".
  # Returns what gsub! returns.
  def chop!
    self.gsub!(/(?:.|\r?\n)\z/, '')
  end

  # Non-destructive form of chop!.
  def chop
    (str = self.dup).chop! or str
  end

  # Length of the receiver after every match of /[^\Wa-zA-Z_\d]/ has been
  # replaced by a single space.
  # NOTE(review): presumably this makes each multibyte character count as one
  # under a multibyte $KCODE -- confirm.
  def jlength
    self.gsub(/[^\Wa-zA-Z_\d]/, ' ').length
  end
  alias jsize jlength

  # jlength of the receiver once every character not described by +str+ has
  # been deleted.
  def jcount(str)
    self.delete("^#{str}").jlength
  end

  # With a block, yields each match of /./m in turn; without one, returns the
  # array of matches.
  def each_char
    if block_given?
      scan(/./m) do |x|
        yield x
      end
    else
      scan(/./m)
    end
  end

end
# Put back the verbosity level that was stashed in $vsave.
$VERBOSE = $vsave