source: documentation/trunk/packages/dokuwiki-2011-05-25a/inc/utf8.php@ 30098

Last change on this file since 30098 was 25027, checked in by jmt12, 12 years ago

Adding the packages directory, and within it a configured version of dokuwiki all ready to run

File size: 83.4 KB
Line 
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author Andreas Gohr <[email protected]>
7 */
8
/**
 * Decide whether the mbstring extension is used by the helpers below.
 *
 * UTF8_MBSTRING may be predefined by the caller. Otherwise it becomes 1 when
 * mb_substr() exists and UTF8_NOMBSTRING has not been defined, and 0 in every
 * other case.
 */
if (!defined('UTF8_MBSTRING')) {
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}

// when mbstring is in use, make UTF-8 its internal encoding
if (UTF8_MBSTRING) {
    mb_internal_encoding('UTF-8');
}
21
if (!function_exists('utf8_isASCII')) {
    /**
     * Tell whether a string consists of 7bit ASCII bytes only
     *
     * @param  string $str string to inspect
     * @return bool   false when at least one byte outside 0x00-0x7F is found,
     *                true otherwise (the empty string counts as ASCII)
     */
    function utf8_isASCII($str) {
        $hasHighByte = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);
        return !$hasHighByte;
    }
}
32
if (!function_exists('utf8_strip')) {
    /**
     * Strips all high-byte characters
     *
     * Every byte with a value of 128 or above is dropped, so the result is a
     * pure 7bit ASCII string.
     *
     * @param  string $str input string
     * @return string the input with all bytes >= 0x80 removed
     */
    function utf8_strip($str) {
        $ascii = '';
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // square-bracket offsets: the former $str{$i} form is a parse
            // error since PHP 8.0
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }
}
52
if (!function_exists('utf8_check')) {
    /**
     * Tries to detect whether a string is structurally UTF-8
     *
     * Only the byte structure is examined: a lead byte has to be followed by
     * the announced number of 10bbbbbb continuation bytes. Lead bytes
     * announcing 4 or 5 continuation bytes are accepted as well, and overlong
     * encodings are not rejected.
     *
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str string to test
     * @return bool   true if every byte fits the pattern, false otherwise
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead < 0x80) {                     // 0bbbbbbb
                $pos++;
                continue;
            }

            if (($lead & 0xE0) == 0xC0) {           // 110bbbbb
                $trail = 1;
            } elseif (($lead & 0xF0) == 0xE0) {     // 1110bbbb
                $trail = 2;
            } elseif (($lead & 0xF8) == 0xF0) {     // 11110bbb
                $trail = 3;
            } elseif (($lead & 0xFC) == 0xF8) {     // 111110bb
                $trail = 4;
            } elseif (($lead & 0xFE) == 0xFC) {     // 1111110b
                $trail = 5;
            } else {
                return false;                       // matches no model
            }

            // the announced continuation bytes have to follow
            for ($k = 0; $k < $trail; $k++) {
                $pos++;
                if ($pos == $total) {
                    return false;                   // string ended too early
                }
                if ((ord($Str[$pos]) & 0xC0) != 0x80) {
                    return false;                   // not a 10bbbbbb byte
                }
            }

            $pos++;
        }

        return true;
    }
}
80
if (!function_exists('utf8_strlen')) {
    /**
     * Unicode aware replacement for strlen()
     *
     * Counts characters by counting every byte that is not a UTF-8
     * continuation byte (10xxxxxx): each well-formed character contains
     * exactly one such byte. This replaces the former
     * strlen(utf8_decode($string)) trick, because utf8_decode() is deprecated
     * as of PHP 8.2.
     *
     * For well-formed UTF-8 the result is identical to the old
     * implementation. In malformed input, stray continuation bytes are not
     * counted.
     *
     * @see    strlen()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string) {
        $string = (string)$string;
        return strlen(preg_replace('/[\x80-\xBF]/', '', $string));
    }
}
97
if (!function_exists('utf8_substr')) {
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length).
     * With mbstring enabled (UTF8_MBSTRING) the call is passed on to
     * mb_substr(). Otherwise a PCRE pattern with the 'u' flag is built.
     *
     * Notes on the PCRE fallback:
     *
     * - pcre only supports repetitions of less than 65536, so larger offsets
     *   and lengths are expressed as a repeated group of 65535 characters
     *   plus a remainder
     * - like mb_substr (and unlike substr) an empty string is returned
     *   instead of false
     * - counting the characters of the string is relatively expensive, so it
     *   is only done when needed (negative offset or a given length). The
     *   count is taken as the number of bytes that are not UTF-8 continuation
     *   bytes, which avoids utf8_decode() (deprecated as of PHP 8.2)
     *
     * @param  string  $str    UTF-8 encoded string
     * @param  integer $offset number of UTF-8 characters offset (from left)
     * @param  integer $length (optional) length in UTF-8 characters from offset
     * @return string  the selected part, or '' if nothing can be selected
     */
    function utf8_substr($str, $offset, $length = null) {
        if (UTF8_MBSTRING) {
            if ($length === null) {
                return mb_substr($str, $offset);
            }
            return mb_substr($str, $offset, $length);
        }

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str = (string)$str;
        $offset = (int)$offset;
        if (!is_null($length)) $length = (int)$length;

        // handle trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        // character count, only computed when one of the branches below needs it
        $strlen = null;
        if ($offset < 0 || !is_null($length)) {
            $strlen = strlen(preg_replace('/[\x80-\xBF]/', '', $str));
        }

        // normalise -ve offsets (a tail anchored pattern would be horribly slow)
        if ($offset < 0) {
            $offset = $strlen + $offset;
            if ($offset < 0) $offset = 0;
        }

        // pattern for offset: a non-captured group equal in length to offset
        $offset_pattern = '';
        if ($offset > 0) {
            $Ox = (int)($offset / 65535);
            $Oy = $offset % 65535;

            if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
            $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
        } else {
            $offset_pattern = '^'; // offset == 0; just anchor the pattern
        }

        // pattern for length
        $length_pattern = '';
        if (is_null($length)) {
            $length_pattern = '(.*)$'; // the rest of the string
        } else {
            if ($offset > $strlen) return ''; // another trivial case

            if ($length > 0) {
                // reduce any length that would go past the end of the string
                $length = min($strlen - $offset, $length);

                $Lx = (int)($length / 65535);
                $Ly = $length % 65535;

                // +ve length requires a captured group of length characters
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

            } else if ($length < 0) {
                // more characters to cut from the end than remain after offset
                if ($length < ($offset - $strlen)) return '';

                $Lx = (int)((-$length) / 65535);
                $Ly = (-$length) % 65535;

                // -ve length: capture everything except a group of -length
                // characters anchored at the tail-end of the string
                if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
                $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
            }
        }

        if (!preg_match('#'.$offset_pattern.$length_pattern.'#us', $str, $match)) return '';
        return $match[1];
    }
}
201
if (!function_exists('utf8_substr_replace')) {
    /**
     * Unicode aware replacement for substr_replace()
     *
     * Offsets and lengths are counted in characters. As with
     * substr_replace(), a negative $start counts from the end of the string
     * and a negative $length gives the number of characters from the end of
     * the string at which replacing stops. Note that $length defaults to 0
     * here (pure insertion), not to "the rest of the string".
     *
     * @see    substr_replace()
     * @param  string  $string      UTF-8 encoded input string
     * @param  string  $replacement string to put in place of the cut-out part
     * @param  integer $start       character offset where replacing starts
     * @param  integer $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start, $length = 0) {
        $start  = (int)$start;
        $length = (int)$length;

        // negative values are relative to the end of the string, so they have
        // to be translated into absolute positions first
        if ($start < 0 || $length < 0) {
            $total = utf8_strlen($string);
            if ($start < 0) $start = max(0, $total + $start);
            if ($start > $total) $start = $total;
            if ($length < 0) $length = max(0, $total - $start + $length);
        }

        $ret = '';
        if ($start > 0) $ret .= utf8_substr($string, 0, $start);
        $ret .= $replacement;
        $ret .= utf8_substr($string, $start + $length);
        return $ret;
    }
}
217
if (!function_exists('utf8_ltrim')) {
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the call is handed to ltrim(). With a charlist,
     * every character listed in it is stripped from the beginning of the
     * string; the characters are taken literally ("a..z" style ranges are not
     * supported).
     *
     * @see    ltrim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip
     * @return string
     */
    function utf8_ltrim($str, $charlist = '') {
        if ($charlist == '') return ltrim($str);

        // quote the charlist for use in a character class; preg_quote() also
        // covers '^', which would otherwise negate or break the class when it
        // comes first in the list
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/^['.$charlist.']+/u', '', $str);
    }
}
235
if (!function_exists('utf8_rtrim')) {
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the call is handed to rtrim(). With a charlist,
     * every character listed in it is stripped from the end of the string;
     * the characters are taken literally ("a..z" style ranges are not
     * supported).
     *
     * @see    rtrim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip
     * @return string
     */
    function utf8_rtrim($str, $charlist = '') {
        if ($charlist == '') return rtrim($str);

        // quote the charlist for use in a character class; preg_quote() also
        // covers '^', which would otherwise negate or break the class when it
        // comes first in the list
        $charlist = preg_quote($charlist, '/');

        return preg_replace('/['.$charlist.']+$/u', '', $str);
    }
}
253
if (!function_exists('utf8_trim')) {
    /**
     * Unicode aware replacement for trim()
     *
     * An empty charlist is handled by trim(). Otherwise the string is passed
     * through utf8_rtrim() and then utf8_ltrim() with the same charlist.
     *
     * @see    trim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip from both ends
     * @return string
     */
    function utf8_trim($str, $charlist = '') {
        if ($charlist == '') {
            return trim($str);
        }

        $tailStripped = utf8_rtrim($str, $charlist);
        return utf8_ltrim($tailStripped, $charlist);
    }
}
268
if (!function_exists('utf8_strtolower')) {
    /**
     * Unicode aware replacement for strtolower()
     *
     * mb_strtolower() does the work when UTF8_MBSTRING is set. Otherwise the
     * string is translated with the global $UTF8_UPPER_TO_LOWER table.
     *
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtolower($string) {
        if (!UTF8_MBSTRING) {
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string, $UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string, 'utf-8');
    }
}
286
if (!function_exists('utf8_strtoupper')) {
    /**
     * Unicode aware replacement for strtoupper()
     *
     * mb_strtoupper() does the work when UTF8_MBSTRING is set. Otherwise the
     * string is translated with the global $UTF8_LOWER_TO_UPPER table.
     *
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtoupper($string) {
        if (!UTF8_MBSTRING) {
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string, $UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string, 'utf-8');
    }
}
304
if (!function_exists('utf8_ucfirst')) {
    /**
     * UTF-8 aware alternative to ucfirst
     *
     * Makes the first character of a string uppercase. Strings of zero or
     * one characters are handled directly; longer strings are split into
     * their first character and the remainder.
     *
     * @param  string $str UTF-8 encoded string
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str) {
        $chars = utf8_strlen($str);

        if ($chars == 0) {
            return '';
        }
        if ($chars == 1) {
            return utf8_strtoupper($str);
        }

        // split into first character and the rest
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $head = $matches[1];
        $tail = $matches[2];
        return utf8_strtoupper($head).$tail;
    }
}
326
if (!function_exists('utf8_ucwords')) {
    /**
     * UTF-8 aware alternative to ucwords
     *
     * Uppercases the first character of each word in a string. Words are
     * separated by runs of form feed, horizontal tab, vertical tab, linefeed,
     * carriage return or space, following the definition of a "word" at
     * http://www.php.net/ucwords
     *
     * @param  string $str UTF-8 encoded string
     * @return string with first char of each word uppercase
     * @see    http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // group 2: leading whitespace, group 3: first character of the word
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback', $str);
    }

    /**
     * Callback function for the preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * The word is cut out of the match by the byte lengths of the captured
     * groups. The previous ltrim() based approach used ltrim()'s default
     * character set, which differs from the whitespace class of the pattern
     * (it lacks form feed and includes NUL) and so mangled such words.
     *
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see    utf8_ucwords
     * @see    utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];
        // everything after the leading whitespace and the first character
        $rest      = substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
363
if (!function_exists('utf8_deaccent')) {
    /**
     * Replace accented UTF-8 characters using the global accent tables
     *
     * The optional parameter restricts the replacement to lower case letters
     * ($case = -1, table $UTF8_LOWER_ACCENTS) or upper case letters
     * ($case = 1, table $UTF8_UPPER_ACCENTS). The default ($case = 0) applies
     * the lower case table first and the upper case table afterwards.
     *
     * @param  string $string UTF-8 encoded string
     * @param  int    $case   -1 lower only, 1 upper only, 0 both
     * @return string
     */
    function utf8_deaccent($string, $case = 0) {
        $tables = array();

        if ($case <= 0) {
            global $UTF8_LOWER_ACCENTS;
            $tables[] = $UTF8_LOWER_ACCENTS;
        }
        if ($case >= 0) {
            global $UTF8_UPPER_ACCENTS;
            $tables[] = $UTF8_UPPER_ACCENTS;
        }

        foreach ($tables as $table) {
            $string = strtr($string, $table);
        }
        return $string;
    }
}
385
if (!function_exists('utf8_romanize')) {
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched. Anything else is translated
     * with the global $UTF8_ROMANIZATION table.
     *
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_romanize($string) {
        if (!utf8_isASCII($string)) {
            global $UTF8_ROMANIZATION;
            $string = strtr($string, $UTF8_ROMANIZATION);
        }

        return $string;
    }
}
399
if (!function_exists('utf8_stripspecials')) {
    /**
     * Removes special characters from a UTF-8 string
     *
     * Besides the characters listed in the global $UTF8_SPECIAL_CHARS2, the
     * bytes 0x00 to 0x19 and whatever is passed in $additional are replaced.
     * The quoted form of $UTF8_SPECIAL_CHARS2 is computed on the first call
     * and cached in a static variable for later calls.
     *
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (inserted unquoted
     *                            into the regexp character class)
     * @return string
     */
    function utf8_stripspecials($string, $repl = '', $additional = '') {
        global $UTF8_SPECIAL_CHARS;
        global $UTF8_SPECIAL_CHARS2;

        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        $charclass = '['.$additional.'\x00-\x19'.$specials.']';
        return preg_replace('/'.$charclass.'/u', $repl, $string);
    }
}
425
if (!function_exists('utf8_strpos')) {
    /**
     * Unicode aware replacement for strpos
     *
     * The needle is located with the byte based strpos() and the byte
     * position is then converted into a character position. Should the hit
     * lie before the requested character offset, the search is repeated
     * further into the string.
     *
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset to start searching from
     * @return integer|false character position of the needle, or false
     */
    function utf8_strpos($haystack, $needle, $offset = 0) {
        // difference between byte and character position of the last hit
        $shift = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $shift);
            if ($bytePos === false) {
                return false;
            }

            // byte position -> character position
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            if ($charPos < $offset) {
                $shift = $bytePos - $charPos;
            }
        } while ($charPos < $offset);

        return $charPos;
    }
}
456
if (!function_exists('utf8_tohtml')) {
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are emitted as plain characters, code points
     * below 0x100 as decimal references (&#NNN;) and everything above as
     * hexadecimal references (&#xHHHH;).
     *
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml($str) {
        $pieces = array();

        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $pieces[] = '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $pieces[] = "&#$cp;";
            } else {
                $pieces[] = chr($cp);
            }
        }

        return implode('', $pieces);
    }
}
478
if (!function_exists('utf8_unhtml')) {
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default ($entities is null) only numeric references (&#..; and
     * &#x..;) are decoded, through utf8_decode_numeric(). With any other
     * value for $entities (e.g. HTML_ENTITIES), named entities including
     * &amp; &lt; etc. are decoded as well, in the same pass, through a shared
     * utf8_entity_decoder instance.
     *
     * Doing both in one pass avoids the problem that would occur if you had
     * to decode "&amp;#38;&#38;amp;#38;" in two steps:
     *
     *   unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     *   utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     *   what it should be                   -> "&#38;&amp#38;"
     *
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities = null) {
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if (!is_null($entities)) {
            // numeric and named entities
            return preg_replace_callback(
                '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                array(&$decoder, 'decode'),
                $str
            );
        }

        // numeric entities only
        return preg_replace_callback(
            '/(&#([Xx])?([0-9A-Za-z]+);)/m',
            'utf8_decode_numeric',
            $str
        );
    }
}
510
if (!function_exists('utf8_decode_numeric')) {
    /**
     * Decodes a numeric HTML character reference to UTF-8
     *
     * Used as a preg_replace_callback callback by utf8_unhtml(). The patterns
     * used there accept any alphanumeric run after "&#", so the digits are
     * validated here: a reference that is not a proper decimal or hexadecimal
     * number (e.g. "&#abc;") is returned unchanged instead of being turned
     * into a NUL character.
     *
     * @param  array $ent match array: [0] complete match, [2] 'x' or 'X' for
     *                    hexadecimal references (empty otherwise), [3] digits
     * @return string the decoded character, or the unchanged match
     */
    function utf8_decode_numeric($ent) {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            if (!preg_match('/^[0-9A-Fa-f]+\z/', $digits)) return $ent[0];
            $cp = hexdec($digits);
        } else {
            if (!preg_match('/^[0-9]+\z/', $digits)) return $ent[0];
            $cp = intval($digits);
        }

        return unicode_to_utf8(array($cp));
    }
}
525
if (!class_exists('utf8_entity_decoder')) {
    /**
     * Decoder for named and numeric HTML entities, used by utf8_unhtml()
     */
    class utf8_entity_decoder {
        /** @var array maps entities ("&amp;") to their UTF-8 characters */
        public $table;

        /**
         * Builds the entity => character lookup table
         *
         * A real __construct() is required: a method named after the class is
         * no longer run as constructor as of PHP 8.0, which left the table
         * unset. The translation table is requested in UTF-8 explicitly, so
         * the characters can be used as they are regardless of the PHP
         * version's default charset.
         */
        public function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table = array_flip($table);
        }

        /**
         * Converts a single byte, read as its ordinal value, to UTF-8
         *
         * No longer needed by the constructor, kept for compatibility.
         *
         * @param  string $c one-byte string
         * @return string
         */
        public function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * preg_replace_callback callback decoding one entity
         *
         * @param  array $ent match array: [0] complete match, [1] '#' for
         *                    numeric references, [2] hex marker, [3] name/digits
         * @return string decoded character, or the match itself if the entity
         *                is unknown
         */
        public function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            } elseif (array_key_exists($ent[0], $this->table)) {
                return $this->table[$ent[0]];
            } else {
                return $ent[0];
            }
        }
    }
}
548
if (!function_exists('utf8_to_unicode')) {
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence (illegal lead byte, missing
     * continuation byte, non-shortest form, surrogate, code point above
     * 0x10FFFF, or a sequence cut off at the end of the string) and raises a
     * PHP error at level E_USER_WARNING.
     *
     * Without $strict, bytes that do not fit are skipped and code points from
     * illegal sequences are still added to the output.
     *
     * @param  string  $str    UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed   array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str, $strict = false) {
        $mState = 0; // expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0; // Unicode character being assembled
        $mBytes = 1; // expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for ($i = 0; $i < $len; $i++) {

            // square-bracket offset: $str{$i} is a parse error since PHP 8.0
            $in = ord($str[$i]);

            if ($mState == 0) {

                // When mState is zero we expect either a US-ASCII character
                // or the first octet of a multi-octet sequence.
                if (0 == (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & $in)) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until
                     * the end of the sequence and let the later error handling
                     * code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif ($strict) {
                    /* Current octet is neither in the US-ASCII range nor a
                     * legal first octet of a multi-octet sequence.
                     */
                    trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                        'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                    return false;
                }

            } else {

                // When mState is non-zero, we expect a continuation of the
                // multi-octet sequence
                if (0x80 == (0xC0 & $in)) {

                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    // End of the multi-octet sequence: mUcs4 now contains the
                    // final Unicode codepoint to be output
                    if (0 == --$mState) {

                        // Check for illegal sequences and codepoints.
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||   // From Unicode 3.1,
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||   // non-shortest form
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||  // is illegal
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if ($strict) {
                                trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                    'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                                return false;
                            }
                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        // initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif ($strict) {
                    // a continuation octet was expected but something else
                    // was found: incomplete multi-octet sequence
                    trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        ' sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                    return false;
                }
            }
        }

        // the string ended in the middle of a multi-octet sequence
        if ($strict && $mState != 0) {
            trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                ' sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

            return false;
        }

        return $out;
    }
}
722
if (!function_exists('unicode_to_utf8')) {
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
     * level E_USER_WARNING. Without $strict such values are skipped.
     *
     * The string is built by plain concatenation. The former output buffer
     * based version left its buffer open whenever it returned false in
     * strict mode.
     *
     * @param  array   $arr    unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed   UTF-8 string or false if array contains invalid code points
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr, $strict = false) {
        if (!is_array($arr)) return '';

        $result = '';

        foreach ($arr as $k => $cp) {

            if ($cp < 0 || $cp > 0x10ffff) {
                // out of the Unicode range; negative values used to fall into
                // the 2 byte branch and produced garbage bytes
                if ($strict) {
                    trigger_error(
                        'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                    );
                    return false;
                }

            } else if ($cp <= 0x007f) {
                // ASCII range (including control chars)
                $result .= chr($cp);

            } else if ($cp <= 0x07ff) {
                // 2 byte sequence
                $result .= chr(0xc0 | ($cp >> 6));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                // Byte order mark: skipped on purpose

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                // illegal surrogate
                if ($strict) {
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                    );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                // 3 byte sequence
                $result .= chr(0xe0 | ($cp >> 12));
                $result .= chr(0x80 | (($cp >> 6) & 0x003f));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else {
                // 4 byte sequence (0x10000 - 0x10ffff)
                $result .= chr(0xf0 | ($cp >> 18));
                $result .= chr(0x80 | (($cp >> 12) & 0x3f));
                $result .= chr(0x80 | (($cp >> 6) & 0x3f));
                $result .= chr(0x80 | ($cp & 0x3f));
            }
        }

        return $result;
    }
}
815
if (!function_exists('utf8_to_utf16be')) {
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. The fallback
     * converts via utf8_to_unicode() and writes code points above U+FFFF as
     * surrogate pairs; previously pack('n') silently cut them down to 16 bit.
     * Values above U+10FFFF cannot be represented in UTF-16 and are written
     * as U+FFFD.
     *
     * @param  string  $str UTF-8 encoded string (taken by reference, not modified)
     * @param  boolean $bom prepend the big endian byte order mark?
     * @return string  UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) return $out.mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');

        $uni = utf8_to_unicode($str);
        foreach ($uni as $cp) {
            if ($cp > 0x10FFFF) {
                $cp = 0xFFFD;
            }

            if ($cp > 0xFFFF) {
                // split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }
}
833
if (!function_exists('utf16be_to_utf8')) {
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is split into 16 bit units. A high surrogate followed by a
     * low surrogate is combined into the code point it encodes before the
     * result is handed to unicode_to_utf8(); previously the two halves were
     * passed on separately. Unpaired surrogates are passed on as they are.
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*', $str);
        if (!is_array($units)) return '';

        $units = array_values($units); // unpack() numbers its result from 1
        $count = count($units);
        $uni   = array();

        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];

            $isPair = ($unit >= 0xD800 && $unit <= 0xDBFF)
                && ($i + 1 < $count)
                && ($units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF);

            if ($isPair) {
                $uni[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed as well
            } else {
                $uni[] = $unit;
            }
        }

        return unicode_to_utf8($uni);
    }
}
845
if (!function_exists('utf8_bad_replace')) {
    /**
     * Replace bad bytes with an alternative character
     *
     * An ASCII character is recommended as replacement char. Note that the
     * default is the empty string, so bad bytes are removed unless a
     * replacement is given.
     *
     * The PCRE pattern used to locate bad bytes in a UTF-8 string comes from
     * the W3 FAQ: Multilingual Forms, modified to include the full ASCII
     * range including control chars.
     *
     * The string is scanned in place using the offset parameter of
     * preg_match(), so it is not copied again after every match.
     *
     * @see    http://www.w3.org/International/questions/qa-forms-utf-8
     * @param  string $str     string to search
     * @param  string $replace string to replace bad bytes with (defaults to '') - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
            '([\x00-\x7F]'.                          # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
            '|(.{1}))';                              # invalid byte

        $str    = (string)$str;
        $len    = strlen($str);
        $pos    = 0;
        $result = '';

        // every byte matches one of the alternatives, so each match starts
        // exactly at $pos and moves the position forward by at least one byte
        while ($pos < $len && preg_match('/'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
            if (!isset($matches[2])) {
                $result .= $matches[0];   // a valid character
            } else {
                $result .= $replace;      // group 2 only matches an invalid byte
            }
            $pos += strlen($matches[0]);
        }

        return $result;
    }
}
887
if (!function_exists('utf8_correctIdx')) {
    /**
     * Adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * An index at or below 0 yields 0, an index at or beyond the end of the
     * string yields the byte length of the string. Otherwise the index is
     * moved for as long as it points at a continuation byte (10xxxxxx).
     *
     * @param  string $str  utf8 character string
     * @param  int    $i    byte index into $str
     * @param  bool   $next direction to search for the boundary,
     *                      false = backwards (start of the current character)
     *                      true  = forwards (start of the next character)
     * @return int    byte index into $str now pointing to a utf8 character boundary
     */
    function utf8_correctIdx(&$str, $i, $next = false) {
        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        $step = $next ? 1 : -1;

        // step over continuation bytes, staying inside the string
        while ($i > 0 && $i < $limit && (ord($str[$i]) & 0xC0) == 0x80) {
            $i += $step;
        }

        return $i;
    }
}
918
919// only needed if no mb_string available
920if(!UTF8_MBSTRING){
921 /**
922 * UTF-8 Case lookup table
923 *
924 * This lookup table maps the lower case letters to their corresponding
925 * upper case letter in UTF-8
926 *
927 * @author Andreas Gohr <[email protected]>
928 */
929 global $UTF8_LOWER_TO_UPPER;
930 if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
931 ""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",
932 ""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",
933 ""=>"","
934"=>"",""=>"",""=>"",""=>"",""=>"","ῳ"=>"á¿Œ","á¿¥"=>"Ῥ","á¿¡"=>"á¿©","ῑ"=>"Ῑ",
935 "ῐ"=>"Ῐ","ῃ"=>"ῌ","៟"=>"Ι","៳"=>"៌","៱"=>"៹","៰"=>"៞","៧"=>"៯","៊"=>"៮","៥"=>"៭","ៀ"=>"៬",
936 "៣"=>"៫","២"=>"៪","១"=>"៩","ៗ"=>"៟","៖"=>"៞","៕"=>"៝","។"=>"ៜ","៓"=>"៛","្"=>"៚","៑"=>"៙",
937 "័"=>"៘","ះ"=>"៏","ំ"=>"៎","ៅ
938"=>"៍","ោ"=>"៌","ៃ"=>"់","ែ"=>"៊","េ"=>"៉","ៀ"=>"ៈ","᜜"=>"á¿»",
939 "ᜌ"=>"Ὼ","᜻"=>"á¿«","᜺"=>"Ὺ","᜹"=>"Ό","᜞"=>"á¿ž","᜷"=>"Ί","᜶"=>"Ὶ","᜵"=>"Ή","ᜎ"=>"Ὴ","ᜳ"=>"Έ",
940 "ᜲ"=>"Ὲ","ᜱ"=>"៻","ᜰ"=>"៺","ᜧ"=>"ᜯ","ᜊ"=>"ᜮ","ᜥ"=>"ᜭ","ᜀ"=>"ᜬ","ᜣ"=>"ᜫ","ᜢ"=>"ᜪ","ᜡ"=>"ᜩ",
941 "᜗"=>"ᜟ","᜕"=>"᜝","ᜓ"=>"᜛","ᜑ"=>"᜙","ᜅ
942"=>"ᜍ","ᜄ"=>"ᜌ","ᜃ"=>"ᜋ","ᜂ"=>"ᜊ","ᜁ"=>"ᜉ","ᜀ"=>"ᜈ",
943 "ጷ"=>"ጿ","ጶ"=>"ጟ","ጵ"=>"ጜ","ጎ"=>"ጌ","ጳ"=>"ጻ","ጲ"=>"ጺ","ጱ"=>"ጹ","ጰ"=>"ጞ","ጧ"=>"ጯ","ጊ"=>"ጮ",
944 "ጥ"=>"ጭ","ጀ"=>"ጬ","ጣ"=>"ጫ","ጢ"=>"ጪ","ጡ"=>"ጩ","ጕ"=>"ጝ","ጔ"=>"ጜ","ጓ"=>"ጛ","ጒ"=>"ጚ","጑"=>"ጙ",
945 "ጐ"=>"ጘ","ጇ"=>"ጏ","ጆ"=>"ጎ","ጅ
946"=>"ግ","ጄ"=>"ጌ","ጃ"=>"ጋ","ጂ"=>"ጊ","ጁ"=>"ጉ","ጀ"=>"ገ","ỹ"=>"Ở",
947 "ỷ"=>"Ỷ","ỵ"=>"Ỏ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ớ","ủ"=>"Ị","ụ"=>"Ề",
948 "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
949 "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ
950"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"ẟ","ẜ"=>"Ẍ",
951 "ẻ"=>"Ẻ","ẹ"=>"ẞ","ặ"=>"Ặ","ẵ"=>"Ẏ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"ẚ",
952 "ầ"=>"Ẋ","ấ"=>"Ẁ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"á¹ ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
953 "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ
954"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"ṟ","Ṝ"=>"Ṍ","á¹»"=>"Ṻ","á¹¹"=>"Ṟ",
955 "ṷ"=>"Ṷ","ṵ"=>"Ṏ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṛ","ṧ"=>"Ṋ","ṥ"=>"Ṁ",
956 "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
957 "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ
958"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","áž¿"=>"ស","ážœ"=>"ឌ",
959 "ុ"=>"ឺ","ឹ"=>"ឞ","ិ"=>"ា","឵"=>"ណ","ឳ"=>"ឲ","ឱ"=>"ឰ","ឯ"=>"ឮ","ឭ"=>"ឬ","ឫ"=>"ឪ","ឩ"=>"រ",
960 "ឧ"=>"ដ","ឥ"=>"ក","ឣ"=>"អ","áž¡"=>"áž ","ᾟ"=>"ᾞ","ឝ"=>"ᾜ","ᾛ"=>"ᾚ","ᾙ"=>"ម","ᾗ"=>"ᾖ","ᾕ"=>"ᾔ",
961 "ᾓ"=>"ᾒ","ᾑ"=>"ᾐ","ᾏ"=>"ᾎ","ᾍ"=>"ᾌ","ᾋ"=>"ᾊ","ᾉ"=>"ᾈ","ᾇ"=>"ᾆ","ᾅ
962"=>"ᾄ","ᾃ"=>"ᾂ","ᾁ"=>"ᾀ",
963 "ֆ"=>"Ֆ","օ
964"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","Õ¿"=>"Տ","ÕŸ"=>"Վ","Õœ"=>"Ս",
965 "ÕŒ"=>"Ռ","Õ»"=>"Ջ","Õº"=>"Պ","Õ¹"=>"Չ","Õž"=>"Ո","Õ·"=>"Շ","Õ¶"=>"Ն","Õµ"=>"Յ
966","ÕŽ"=>"Մ","Õ³"=>"Ճ",
967 "Õ²"=>"Ղ","Õ±"=>"Ձ","Õ°"=>"Հ","Õ¯"=>"Ô¿","Õ®"=>"ÔŸ","Õ­"=>"Ôœ","Õ¬"=>"ÔŒ","Õ«"=>"Ô»","Õª"=>"Ôº","Õ©"=>"Ô¹",
968 "Õš"=>"Ôž","Õ§"=>"Ô·","ÕŠ"=>"Ô¶","Õ¥"=>"Ôµ","Õ€"=>"ÔŽ","Õ£"=>"Ô³","Õ¢"=>"Ô²","Õ¡"=>"Ô±","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
969 "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ
970"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","Ó¹"=>"Óž","Óµ"=>"ÓŽ","Ó³"=>"Ó²","Ó±"=>"Ó°",
971 "Ó¯"=>"Ó®","Ó­"=>"Ó¬","Ó«"=>"Óª","Ó©"=>"Óš","Ó§"=>"ÓŠ","Ó¥"=>"Ó€","Ó£"=>"Ó¢","Ó¡"=>"Ó ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
972 "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
973 "ӆ"=>"Ӆ
974","ӄ"=>"Ӄ","ӂ"=>"Ӂ","Ò¿"=>"ÒŸ","Òœ"=>"ÒŒ","Ò»"=>"Òº","Ò¹"=>"Òž","Ò·"=>"Ò¶","Òµ"=>"ÒŽ","Ò³"=>"Ò²",
975 "Ò±"=>"Ò°","Ò¯"=>"Ò®","Ò­"=>"Ò¬","Ò«"=>"Òª","Ò©"=>"Òš","Ò§"=>"ÒŠ","Ò¥"=>"Ò€","Ò£"=>"Ò¢","Ò¡"=>"Ò ","ҟ"=>"Ҟ",
976 "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
977 "ҁ"=>"Ҁ","Ñ¿"=>"ÑŸ","Ñœ"=>"ÑŒ","Ñ»"=>"Ѻ","ѹ"=>"Ñž","Ñ·"=>"Ѷ","ѵ"=>"ÑŽ","ѳ"=>"Ѳ","ѱ"=>"Ñ°","ѯ"=>"Ñ®",
978 "Ñ­"=>"Ѭ","Ñ«"=>"Ѫ","Ñ©"=>"Ñš","ѧ"=>"ÑŠ","Ñ¥"=>"Ñ€","Ñ£"=>"Ñ¢","Ñ¡"=>"Ñ ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
979 "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ
980","є"=>"Є","ѓ"=>"Ѓ",
981 "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
982 "ш"=>"К","ч"=>"Ч","ц"=>"Њ","х
983"=>"Ð¥","ф"=>"Ѐ","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
984 "П"=>"О","Ðœ"=>"Н","ÐŒ"=>"М","л"=>"Л","к"=>"К","й"=>"Й","О"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
985 "ÐŽ"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","Ï°"=>"Κ","ϯ"=>"Ï®",
986 "Ï­"=>"Ϭ","Ï«"=>"Ϫ","Ï©"=>"Ïš","ϧ"=>"ÏŠ","Ï¥"=>"Ï€","Ï£"=>"Ï¢","Ï¡"=>"Ï ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
987 "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Ί","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
988 "ω"=>"Ω","ψ"=>"Κ","χ"=>"Χ","φ"=>"Ί","υ
989"=>"Î¥","τ"=>"΀","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
990 "ο"=>"Ο","Ο"=>"Ξ","Îœ"=>"Ν","ÎŒ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","Ξ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
991 "ε"=>"Ε","ÎŽ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Æ·",
992 "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Æ®","ʃ"=>"Æ©","ʀ"=>"ÆŠ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","É©"=>"Ɩ","Éš"=>"Ɨ",
993 "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
994 "È­"=>"Ȭ","È«"=>"Ȫ","È©"=>"Èš","ȧ"=>"ÈŠ","È¥"=>"È€","È£"=>"È¢","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
995 "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ
996"=>"Ȅ",
997 "ȃ"=>"Ȃ","ȁ"=>"Ȁ","Ç¿"=>"ÇŸ","Çœ"=>"ÇŒ","Ç»"=>"Ǻ","ǹ"=>"Çž","ǵ"=>"ÇŽ","dz"=>"Dz","ǯ"=>"Ç®","Ç­"=>"Ǭ",
998 "Ç«"=>"Ǫ","Ç©"=>"Çš","ǧ"=>"ÇŠ","Ç¥"=>"Ç€","Ç£"=>"Ç¢","Ç¡"=>"Ç ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
999 "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž
1000","Æ¿"=>"Ç·",
1001 "Ɯ"=>"ƌ","ƹ"=>"ƞ","ƶ"=>"Ƶ","Ǝ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƚ"=>"Ƨ","ƥ"=>"ƀ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1002 "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ
1003"=>"Ƅ","ƃ"=>"Ƃ","Å¿"=>"S","ÅŸ"=>"Åœ",
1004 "Ō"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŏ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ś",
1005 "ŧ"=>"ÅŠ","Å¥"=>"Å€","Å£"=>"Å¢","Å¡"=>"Å ","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1006 "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ
1007","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1008 "ğ"=>"Ĝ","Č"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ď","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1009 "Ä©"=>"Äš","ħ"=>"ÄŠ","Ä¥"=>"Ä€","Ä£"=>"Ä¢","Ä¡"=>"Ä ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1010 "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą
1011"=>"Ą","ă"=>"Ă",
1012 "ā"=>"Ā","ÿ"=>"Åž","ß"=>"Þ","Ãœ"=>"Ý","ÃŒ"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","Þ"=>"Ø","ö"=>"Ö",
1013 "õ"=>"Õ","ÃŽ"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1014 "ë"=>"Ë","ê"=>"Ê","é"=>"É","Ú"=>"È","ç"=>"Ç","Ê"=>"Æ","Ã¥"=>"Å
1015","À"=>"Ä","ã"=>"Ã","â"=>"Â",
1016 "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1017 "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1018 "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1019 );
1020
1021 /**
1022 * UTF-8 Case lookup table
1023 *
1024 * This lookup table maps upper case letters to their corresponding
1025 * lower case letters in UTF-8
1026 *
1027 * @author Andreas Gohr <[email protected]>
1028 */
1029 global $UTF8_UPPER_TO_LOWER;
1030 if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1031 ""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",
1032 ""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",""=>"",
1033 ""=>"",""=>"
1034",""=>"",""=>"",""=>"",""=>"","á¿Œ"=>"ῳ","Ῥ"=>"á¿¥","á¿©"=>"á¿¡","Ῑ"=>"ῑ",
1035 "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"៟","៌"=>"៳","៹"=>"៱","៞"=>"៰","៯"=>"៧","៮"=>"៊","៭"=>"៥","៬"=>"ៀ",
1036 "៫"=>"៣","៪"=>"២","៩"=>"១","៟"=>"ៗ","៞"=>"៖","៝"=>"៕","ៜ"=>"។","៛"=>"៓","៚"=>"្","៙"=>"៑",
1037 "៘"=>"័","៏"=>"ះ","៎"=>"ំ","៍"=>"ៅ
1038","៌"=>"ោ","់"=>"ៃ","៊"=>"ែ","៉"=>"េ","ៈ"=>"ៀ","á¿»"=>"᜜",
1039 "Ὼ"=>"ᜌ","á¿«"=>"᜻","Ὺ"=>"᜺","Ό"=>"᜹","á¿ž"=>"᜞","Ί"=>"᜷","Ὶ"=>"᜶","Ή"=>"᜵","Ὴ"=>"ᜎ","Έ"=>"ᜳ",
1040 "Ὲ"=>"ᜲ","៻"=>"ᜱ","៺"=>"ᜰ","ᜯ"=>"ᜧ","ᜮ"=>"ᜊ","ᜭ"=>"ᜥ","ᜬ"=>"ᜀ","ᜫ"=>"ᜣ","ᜪ"=>"ᜢ","ᜩ"=>"ᜡ",
1041 "ᜟ"=>"᜗","᜝"=>"᜕","᜛"=>"ᜓ","᜙"=>"ᜑ","ᜍ"=>"ᜅ
1042","ᜌ"=>"ᜄ","ᜋ"=>"ᜃ","ᜊ"=>"ᜂ","ᜉ"=>"ᜁ","ᜈ"=>"ᜀ",
1043 "ጿ"=>"ጷ","ጟ"=>"ጶ","ጜ"=>"ጵ","ጌ"=>"ጎ","ጻ"=>"ጳ","ጺ"=>"ጲ","ጹ"=>"ጱ","ጞ"=>"ጰ","ጯ"=>"ጧ","ጮ"=>"ጊ",
1044 "ጭ"=>"ጥ","ጬ"=>"ጀ","ጫ"=>"ጣ","ጪ"=>"ጢ","ጩ"=>"ጡ","ጝ"=>"ጕ","ጜ"=>"ጔ","ጛ"=>"ጓ","ጚ"=>"ጒ","ጙ"=>"጑",
1045 "ጘ"=>"ጐ","ጏ"=>"ጇ","ጎ"=>"ጆ","ግ"=>"ጅ
1046","ጌ"=>"ጄ","ጋ"=>"ጃ","ጊ"=>"ጂ","ጉ"=>"ጁ","ገ"=>"ጀ","Ở"=>"ỹ",
1047 "Ỷ"=>"ỷ","Ỏ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ớ"=>"ứ","Ị"=>"ủ","Ề"=>"ụ",
1048 "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1049 "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ
1050","Ể"=>"ể","Ề"=>"ề","ẟ"=>"ế","Ẍ"=>"ẜ",
1051 "Ẻ"=>"ẻ","ẞ"=>"ẹ","Ặ"=>"ặ","Ẏ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","ẚ"=>"ẩ",
1052 "Ẋ"=>"ầ","Ẁ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","á¹ "=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1053 "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ
1054","Ẃ"=>"ẃ","Ẁ"=>"ẁ","ṟ"=>"ṿ","Ṍ"=>"Ṝ","Ṻ"=>"á¹»","Ṟ"=>"á¹¹",
1055 "Ṷ"=>"ṷ","Ṏ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṛ"=>"ṩ","Ṋ"=>"ṧ","Ṁ"=>"ṥ",
1056 "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1057 "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ
1058","Ṃ"=>"ṃ","Ṁ"=>"ṁ","ស"=>"áž¿","ឌ"=>"ážœ",
1059 "ឺ"=>"ុ","ឞ"=>"ឹ","ា"=>"ិ","ណ"=>"឵","ឲ"=>"ឳ","ឰ"=>"ឱ","ឮ"=>"ឯ","ឬ"=>"ឭ","ឪ"=>"ឫ","រ"=>"ឩ",
1060 "ដ"=>"ឧ","ក"=>"ឥ","អ"=>"ឣ","áž "=>"áž¡","ᾞ"=>"ᾟ","ᾜ"=>"ឝ","ᾚ"=>"ᾛ","ម"=>"ᾙ","ᾖ"=>"ᾗ","ᾔ"=>"ᾕ",
1061 "ᾒ"=>"ᾓ","ᾐ"=>"ᾑ","ᾎ"=>"ᾏ","ᾌ"=>"ᾍ","ᾊ"=>"ᾋ","ᾈ"=>"ᾉ","ᾆ"=>"ᾇ","ᾄ"=>"ᾅ
1062","ᾂ"=>"ᾃ","ᾀ"=>"ᾁ",
1063 "Ֆ"=>"ֆ","Օ"=>"օ
1064","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"Õ¿","Վ"=>"ÕŸ","Ս"=>"Õœ",
1065 "Ռ"=>"ÕŒ","Ջ"=>"Õ»","Պ"=>"Õº","Չ"=>"Õ¹","Ո"=>"Õž","Շ"=>"Õ·","Ն"=>"Õ¶","Յ
1066"=>"Õµ","Մ"=>"ÕŽ","Ճ"=>"Õ³",
1067 "Ղ"=>"Õ²","Ձ"=>"Õ±","Հ"=>"Õ°","Ô¿"=>"Õ¯","ÔŸ"=>"Õ®","Ôœ"=>"Õ­","ÔŒ"=>"Õ¬","Ô»"=>"Õ«","Ôº"=>"Õª","Ô¹"=>"Õ©",
1068 "Ôž"=>"Õš","Ô·"=>"Õ§","Ô¶"=>"ÕŠ","Ôµ"=>"Õ¥","ÔŽ"=>"Õ€","Ô³"=>"Õ£","Ô²"=>"Õ¢","Ô±"=>"Õ¡","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1069 "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ
1070","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Óž"=>"Ó¹","ÓŽ"=>"Óµ","Ó²"=>"Ó³","Ó°"=>"Ó±",
1071 "Ó®"=>"Ó¯","Ó¬"=>"Ó­","Óª"=>"Ó«","Óš"=>"Ó©","ÓŠ"=>"Ó§","Ó€"=>"Ó¥","Ó¢"=>"Ó£","Ó "=>"Ó¡","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1072 "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1073 "Ӆ
1074"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","ÒŸ"=>"Ò¿","ÒŒ"=>"Òœ","Òº"=>"Ò»","Òž"=>"Ò¹","Ò¶"=>"Ò·","ÒŽ"=>"Òµ","Ò²"=>"Ò³",
1075 "Ò°"=>"Ò±","Ò®"=>"Ò¯","Ò¬"=>"Ò­","Òª"=>"Ò«","Òš"=>"Ò©","ÒŠ"=>"Ò§","Ò€"=>"Ò¥","Ò¢"=>"Ò£","Ò "=>"Ò¡","Ҟ"=>"ҟ",
1076 "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1077 "Ҁ"=>"ҁ","ÑŸ"=>"Ñ¿","ÑŒ"=>"Ñœ","Ѻ"=>"Ñ»","Ñž"=>"ѹ","Ѷ"=>"Ñ·","ÑŽ"=>"ѵ","Ѳ"=>"ѳ","Ñ°"=>"ѱ","Ñ®"=>"ѯ",
1078 "Ѭ"=>"Ñ­","Ѫ"=>"Ñ«","Ñš"=>"Ñ©","ÑŠ"=>"ѧ","Ñ€"=>"Ñ¥","Ñ¢"=>"Ñ£","Ñ "=>"Ñ¡","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1079 "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ
1080"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1081 "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1082 "К"=>"ш","Ч"=>"ч","Њ"=>"ц","Ð¥"=>"х
1083","Ѐ"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1084 "О"=>"П","Н"=>"Ðœ","М"=>"ÐŒ","Л"=>"л","К"=>"к","Й"=>"й","И"=>"О","З"=>"з","Ж"=>"ж","Е"=>"е",
1085 "Д"=>"ÐŽ","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"Ï°","Ï®"=>"ϯ",
1086 "Ϭ"=>"Ï­","Ϫ"=>"Ï«","Ïš"=>"Ï©","ÏŠ"=>"ϧ","Ï€"=>"Ï¥","Ï¢"=>"Ï£","Ï "=>"Ï¡","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1087 "Ϙ"=>"ϙ","Π"=>"ϖ","Ί"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1088 "Ω"=>"ω","Κ"=>"ψ","Χ"=>"χ","Ί"=>"φ","Î¥"=>"υ
1089","΀"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1090 "Ο"=>"ο","Ξ"=>"Ο","Ν"=>"Îœ","Μ"=>"ÎŒ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"Ξ","Η"=>"η","Ζ"=>"ζ",
1091 "Ε"=>"ε","Δ"=>"ÎŽ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Æ·"=>"ʒ",
1092 "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Æ®"=>"ʈ","Æ©"=>"ʃ","ÆŠ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"É©","Ɨ"=>"Éš",
1093 "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1094 "Ȭ"=>"È­","Ȫ"=>"È«","Èš"=>"È©","ÈŠ"=>"ȧ","È€"=>"È¥","È¢"=>"È£","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1095 "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ
1096",
1097 "Ȃ"=>"ȃ","Ȁ"=>"ȁ","ÇŸ"=>"Ç¿","ÇŒ"=>"Çœ","Ǻ"=>"Ç»","Çž"=>"ǹ","ÇŽ"=>"ǵ","Dz"=>"dz","Ç®"=>"ǯ","Ǭ"=>"Ç­",
1098 "Ǫ"=>"Ç«","Çš"=>"Ç©","ÇŠ"=>"ǧ","Ç€"=>"Ç¥","Ç¢"=>"Ç£","Ç "=>"Ç¡","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1099 "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž
1100"=>"dž","Ƿ"=>"ƿ",
1101 "ƌ"=>"Ɯ","ƞ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"Ǝ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƚ","ƀ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1102 "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ
1103","Ƃ"=>"ƃ","S"=>"Å¿","Åœ"=>"ÅŸ",
1104 "Ż"=>"Ō","Ź"=>"ź","Ŷ"=>"ŷ","Ŏ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ś"=>"ũ",
1105 "ÅŠ"=>"ŧ","Å€"=>"Å¥","Å¢"=>"Å£","Å "=>"Å¡","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1106 "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ
1107"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1108 "Ĝ"=>"ğ","Ļ"=>"Č","Ĺ"=>"ĺ","Ķ"=>"ķ","Ď"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1109 "Äš"=>"Ä©","ÄŠ"=>"ħ","Ä€"=>"Ä¥","Ä¢"=>"Ä£","Ä "=>"Ä¡","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1110 "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą
1111","Ă"=>"ă",
1112 "Ā"=>"ā","Åž"=>"ÿ","Þ"=>"ß","Ý"=>"Ãœ","Ü"=>"ÃŒ","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"Þ","Ö"=>"ö",
1113 "Õ"=>"õ","Ô"=>"ÃŽ","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1114 "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"Ú","Ç"=>"ç","Æ"=>"Ê","Å
1115"=>"Ã¥","Ä"=>"À","Ã"=>"ã","Â"=>"â",
1116 "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1117 "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1118 "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1119 );
1120}; // end of case lookup tables
1121
1122/**
1123 * UTF-8 lookup table for lower case accented letters
1124 *
1125 * This lookup table defines ASCII-7 replacements for accented characters.
1126 * It covers lower case letters only.
1127 *
1128 * @author Andreas Gohr <[email protected]>
1129 * @see utf8_deaccent()
1130 */
1131global $UTF8_LOWER_ACCENTS;
1132if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1133 'à' => 'a', 'ÃŽ' => 'o', 'ď' => 'd', 'ᾟ' => 'f', 'ë' => 'e', 'Å¡' => 's', 'Æ¡' => 'o',
1134 'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1135 'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1136 'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1137 'ṡ' => 's', 'Þ' => 'o', 'Ä£' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1138 'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1139 'Å«' => 'u', 'č' => 'c', 'ö' => 'oe', 'Ú' => 'e', 'Å·' => 'y', 'ą
1140' => 'a', 'ł' => 'l',
1141 'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ÄŒ' => 'l', 'ƒ' => 'f', 'ÅŸ' => 'z',
1142 'ẃ' => 'w', 'ឃ' => 'b', 'Ã¥' => 'a', 'ì' => 'i', 'ï' => 'i', 'ᾋ' => 'd', 'Å¥' => 't',
1143 'ŗ' => 'r', 'À' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ÃŒ' => 'ue', 'ò' => 'o',
1144 'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1145 'ÿ' => 'y', 'Å©' => 'u', 'Å­' => 'u', 'Æ°' => 'u', 'Å£' => 't', 'Ãœ' => 'y', 'ő' => 'o',
1146 'â' => 'a', 'ÄŸ' => 'l', 'ẅ
1147' => 'w', 'Ō' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1148 'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1149 'û' => 'u', 'ß' => 'th', 'ð' => 'dh', 'Ê' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1150);
1151
1152/**
1153 * UTF-8 lookup table for upper case accented letters
1154 *
1155 * This lookup table defines ASCII-7 replacements for accented characters.
1156 * It covers upper case letters only.
1157 *
1158 * @author Andreas Gohr <[email protected]>
1159 * @see utf8_deaccent()
1160 */
1161global $UTF8_UPPER_ACCENTS;
1162if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1163 'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'ᾞ' => 'F', 'Ë' => 'E', 'Å ' => 'S', 'Æ ' => 'O',
1164 'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1165 'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ
1166' => 'N', 'Ĺ' => 'L', 'ÄŠ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1167 'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1168 'á¹ ' => 'S', 'Ø' => 'O', 'Ä¢' => 'G', 'ÅŠ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1169 'Ś' => 'S', 'Î' => 'I', 'Å°' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'ÅŽ' => 'W', 'Ṫ' => 'T',
1170 'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1171 'Ų' => 'U', 'Å®' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ä»' => 'L', 'Ƒ' => 'F', 'Åœ' => 'Z',
1172 'Ẃ' => 'W', 'ᾂ' => 'B', 'Å
1173' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'ᾊ' => 'D', 'Å€' => 'T',
1174 'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1175 'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ä€' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'ÄŽ' => 'J',
1176 'Ş' => 'Y', 'Ś' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1177 'Â' => 'A', 'Äœ' => 'L', 'Ẅ' => 'W', 'Å»' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ä ' => 'G',
1178 'Ṁ' => 'M', 'Ō' => 'O', 'Äš' => 'I', 'Ù' => 'U', 'Ä®' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1179 'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1180);
1181
/**
 * UTF-8 array of common special characters
 *
 * Holds the Unicode code points (as integers, not UTF-8 byte sequences) of
 * the special characters (neither letter nor digit) defined in the various
 * local charsets. It is not a complete list of the non-alphanumeric
 * characters available in Unicode, but it should match most cases of
 * special chars.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * The table is only populated when the global is still empty, so a value
 * assigned before this file is loaded is left untouched.
 *
 * @author Andreas Gohr <[email protected]>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
    // ASCII punctuation, DEL and the C1 control range
    0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
    0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c,
    0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
    0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
    0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
    0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
    0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
    0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
    0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
    0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
    0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
    0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
    0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
    0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
    0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
    0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
    0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
    0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
    0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
    0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
    0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
    0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
    0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
    0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
    0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
    0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
    0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
    0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
    0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
    0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
    0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
    0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
    0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
    0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
    0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
    0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
    0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
    0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
    0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
    0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
    0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
    0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
    0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
    0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
    0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
    0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
    0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
    0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
    0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
    0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
    0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
    0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
    0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
    // fullwidth forms; each code point is listed exactly once
    0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
    0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
    0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
    0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
    0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
    0xffeb, 0xffec, 0xffed, 0xffee,
    // supplementary plane: U+1D6FC .. U+1D71B
    0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
    0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
    0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
    0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
    // Invisible spacing/format characters, given as code points like every
    // other entry: FIGURE SPACE, NARROW NO-BREAK SPACE, WORD JOINER and
    // ZERO WIDTH NO-BREAK SPACE (BOM). Their UTF-8 byte sequences (E2 80 87,
    // E2 80 AF, E2 81 A0, EF BB BF) must not be written here as integers:
    // such values are not valid code points. NO-BREAK SPACE (0x00a0) is
    // already listed above.
    0x2007, 0x202f, 0x2060, 0xfeff,
);
1263
1264// utf8 version of above data
1265global $UTF8_SPECIAL_CHARS2;
1266if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1267 "\x1A".'
1268
1269
1270 !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…
1271†‡ˆ‰Š‹ŒŽ‘’“”•ᅵ'.
1272 'ᅵ—˜™š›œžŸ ¡¢£€¥Š§š©ª«¬­®¯°±²³Žµ¶·ž¹º»Œœᅵ'.
1273 'ᅵ¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅
1274·ϖְֱֲֳ֎ֵֶַֹֻ֞֌ֿ֜֟ᅵ'.
1275 'ï¿œ××‚×ƒ×³×ŽØŒØ›ØŸÙ€Ù‹ÙŒÙÙŽÙÙÙ‘Ù’Ùªàž¿â€Œâ€â€Žâ€â€“â€”â€•â€—â€˜â€™â€šâ€œâ€ï¿œ'.
1276 'ᅵᅵ†‡• ‰′″‹›⁄₧₪₫€№℘™ℊℵ←↑→↓↔↕↵'.
1277 '⇐⇑⇒⇓⇔∀∂∃∅
1278∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧√ᅵ'.
1279 'ᅵ∪∫∎∌≅
1280≈≠≡≀≥⊂⊃⊄⊆⊇⊕⊗⊥⋅
1281⌐⌠⌡〈〉⑩─ᅵ'.
1282 'ᅵᅵ┌┐└┘├─┬┎┌═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1283 '╡╢╣╀╥╊╧╚╩╪╫╬▀▄█▌▐░▒▓■▲▌◆◊●ᅵ'.
1284 'ᅵ★
1285☎☛☞♠♣♥♊✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕ᅵ'.
1286 'ᅵᅵ✗✘✙✚✛✜✝✞✟✠✡✢✣✀✥✊✧✩✪✫✬✭✮✯✰✱'.
1287 '✲✳✎✵✶✷✞✹✺✻✌✜✟✿❀❁❂❃❄❅
1288❆❇❈❉❊❋ᅵ'.
1289 'ᅵ❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❀❥❊❧❿➉➓➔➘➙➚ᅵ'.
1290 'ᅵᅵ➜➝➞➟➠➡➢➣➀➥➊➧➚➩➪➫➬➭➮➯➱➲➳➎➵➶'.
1291 '➷➞➹➺➻➌➜➟'.
1292 ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1293 'ᅵ'.
1294 'ᅵ﹌﹜'.
1295 '
1296'.
1297 '¢£¬ ̄￀¥ᅧᅳ←↑→↓■○'.
1298 '𝛌𝛜𝛟𝛿𝜀𝜁𝜂𝜃𝜄𝜅
1299𝜆𝜇𝜈𝜉𝜊𝜋𝜌𝜍𝜎𝜏𝜐𝜑𝜒𝜓𝜔𝜕𝜖𝜗𝜘𝜙𝜚𝜛'.
1300 '   ⁠';
1301
1302/**
1303 * Romanization lookup table
1304 *
1305 * This lookup table provides a way to transform strings written in a script
1306 * that is not based on Latin letters into plain ASCII.
1307 *
1308 * Please note: this is not a scientific transliteration table. It only works
1309 * one way, from non-Latin to ASCII, and it works by simple character replacement
1310 * only. Language-specific peculiarities are not supported.
1311 *
1312 * @author Andreas Gohr <[email protected]>
1313 * @author Vitaly Blokhin <[email protected]>
1314 * @link http://www.uconv.com/translit.htm
1315 * @author Bisqwit <[email protected]>
1316 * @link http://kanjidict.stc.cx/hiragana.php?src=2
1317 * @link http://www.translatum.gr/converter/greek-transliteration.htm
1318 * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1319 * @link http://www.btranslations.com/resources/romanization/korean.asp
1320 * @author Arthit Suriyawongkul <[email protected]>
1321 * @author Denis Scheither <[email protected]>
1322 */
1323global $UTF8_ROMANIZATION;
1324if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1325 // scandinavian - differs from what we do in deaccent
1326 'å'=>'a','Å
1327'=>'A','À'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1328
1329 //russian cyrillic
1330 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1331 'ÐŽ'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1332 'з'=>'z','З'=>'Z','О'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1333 'л'=>'l','Л'=>'L','ÐŒ'=>'m','М'=>'M','Ðœ'=>'n','Н'=>'N','П'=>'o','О'=>'O',
1334 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1335 'у'=>'u','У'=>'U','ф'=>'f','Ѐ'=>'F','х
1336'=>'x','Ð¥'=>'X','ц'=>'c','Њ'=>'C',
1337 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','К'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1338 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1339 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1340 // Ukrainian cyrillic
1341 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1342 // Georgian
1343 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1344 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1345 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','Ⴠ'=>'ph','ქ'=>'kh','჊'=>'gh','ყ'=>'q',
1346 'ლ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1347 'ჰ'=>'xh',
1348 //Sanskrit
1349 'à€…
1350'=>'a','à€†'=>'ah','à€‡'=>'i','à€ˆ'=>'ih','à€‰'=>'u','à€Š'=>'uh','à€‹'=>'ry',
1351 'ॠ'=>'ryh','à€Œ'=>'ly','ॡ'=>'lyh','à€'=>'e','à€'=>'ay','à€“'=>'o','à€”'=>'aw',
1352 'à€…
1353à€‚'=>'amh','à€…
1354à€ƒ'=>'aq','à€•'=>'k','à€–'=>'kh','à€—'=>'g','à€˜'=>'gh','à€™'=>'nh',
1355 'à€š'=>'c','à€›'=>'ch','à€œ'=>'j','à€'=>'jh','à€ž'=>'ny','à€Ÿ'=>'tq','à€ '=>'tqh',
1356 'à€¡'=>'dq','à€¢'=>'dqh','à€£'=>'nq','à€€'=>'t','à€¥'=>'th','à€Š'=>'d','à€§'=>'dh',
1357 'à€š'=>'n','à€ª'=>'p','à€«'=>'ph','à€¬'=>'b','à€­'=>'bh','à€®'=>'m','à€¯'=>'z','à€°'=>'r',
1358 'à€²'=>'l','à€µ'=>'v','à€¶'=>'sh','à€·'=>'sqh','à€ž'=>'s','à€¹'=>'x',
1359 //Hebrew
1360 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1361 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1362 'ס'=>'s','ע'=>'ah','ף'=>'f','׀'=>'p','ץ'=>'c','׊'=>'c','ק'=>'q','ך'=>'r',
1363 'ש'=>'sh','ת'=>'t',
1364 //Arabic
1365 'ا'=>'a','ؚ'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1366 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','؎'=>'sh','ص'=>'s\'','ض'=>'d\'',
1367 'Ø·'=>'t\'','Øž'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1368 'ل'=>'l','م
1369'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1370
1371 // Japanese characters (last update: 2008-05-09)
1372
1373 // Japanese hiragana
1374
1375 // 3 character syllables, っ doubles the consonant after
1376 'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ
1377'=>'cchu',
1378 'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ
1379'=>'bbyu',
1380 'っぎゃ'=>'ppya','っぎぇ'=>'ppye','っぎぃ'=>'ppyi','っぎょ'=>'ppyo','っぎゅ
1381'=>'ppyu',
1382 'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ
1383'=>'cchu',
1384 // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ
1385'=>'hyu',
1386 'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ
1387'=>'kkyu',
1388 'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ
1389'=>'ggyu',
1390 'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ
1391'=>'mmyu',
1392 'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ
1393'=>'nnyu',
1394 'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ
1395'=>'rryu',
1396 'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ
1397'=>'sshu',
1398
1399 // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1400 'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1401 'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1402
1403 // 2 character syllables - normal
1404 'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1405 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ
1406'=>'chu',
1407 'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ
1408'=>'hyu',
1409 'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ
1410'=>'byu',
1411 'ぎゃ'=>'pya','ぎぇ'=>'pye','ぎぃ'=>'pyi','ぎょ'=>'pyo','ぎゅ
1412'=>'pyu',
1413 'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ
1414'=>'kyu',
1415 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ
1416'=>'gyu',
1417 'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ
1418'=>'myu',
1419 'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ
1420'=>'nyu',
1421 'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ
1422'=>'ryu',
1423 'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ
1424'=>'shu',
1425 'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ
1426'=>'ju',
1427 'うぇ'=>'we','うぃ'=>'wi',
1428 'いぇ'=>'ye',
1429
1430 // 2 character syllables, っ doubles the consonant after
1431 'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っが'=>'bbo','っぶ'=>'bbu',
1432 'っぱ'=>'ppa','っぺ'=>'ppe','っぎ'=>'ppi','っぜ'=>'ppo','っぷ'=>'ppu',
1433 'った'=>'tta','っお'=>'tte','っち'=>'cchi','っず'=>'tto','っ぀'=>'ttsu',
1434 'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1435 'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1436 'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1437 'った'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1438 'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1439 'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1440 'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1441 'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1442
1443 // 1 character syllables
1444 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1445 'は'=>'ha','ぞ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1446 'ば'=>'ba','べ'=>'be','び'=>'bi','が'=>'bo','ぶ'=>'bu',
1447 'ぱ'=>'pa','ぺ'=>'pe','ぎ'=>'pi','ぜ'=>'po','ぷ'=>'pu',
1448 'た'=>'ta','お'=>'te','ち'=>'chi','ず'=>'to','぀'=>'tsu',
1449 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1450 'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1451 'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1452 'た'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1453 'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1454 'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1455 'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1456 'わ'=>'wa','を'=>'wo',
1457 'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1458 'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1459 // old characters
1460 'ゑ'=>'we','ゐ'=>'wi',
1461
1462 // convert what's left (probably only kicks in when something's missing above)
1463 // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ
1464'=>'u',
1465 // 'ゃ'=>'ya','ょ'=>'yo','ゅ
1466'=>'yu',
1467
1468 // never seen one of those (disabled for the moment)
1469 // 'ノぁ'=>'va','ノぇ'=>'ve','ノぃ'=>'vi','ノぉ'=>'vo','ノ'=>'vu',
1470 // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ
1471'=>'dhu',
1472 // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ
1473'=>'dwu',
1474 // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ
1475'=>'dyu',
1476 // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ
1477'=>'fwu',
1478 // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ
1479'=>'fyu',
1480 // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ
1481'=>'swu',
1482 // 'おゃ'=>'tha','おぇ'=>'the','おぃ'=>'thi','おょ'=>'tho','おゅ
1483'=>'thu',
1484 // '぀ゃ'=>'tsa','぀ぇ'=>'tse','぀ぃ'=>'tsi','぀ょ'=>'tso','぀'=>'tsu',
1485 // 'ずぁ'=>'twa','ずぇ'=>'twe','ずぃ'=>'twi','ずぉ'=>'two','ずぅ
1486'=>'twu',
1487 // 'ノゃ'=>'vya','ノぇ'=>'vye','ノぃ'=>'vyi','ノょ'=>'vyo','ノゅ
1488'=>'vyu',
1489 // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ
1490'=>'whu',
1491 // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ
1492'=>'zhu',
1493 // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ
1494'=>'zyu',
1495
1496 // 'spare' characters from other romanization systems
1497 // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1498 // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1499 // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1500 // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ
1501'=>'cyu',
1502 //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ
1503'=>'jyu',
1504 //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ
1505'=>'lyu',
1506 //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ
1507'=>'syu',
1508 //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ
1509'=>'tyu',
1510 //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1511 //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ
1512'=>'jju',
1513
1514
1515 // Japanese katakana
1516
1517 // 4 character syllables: ッ doubles the consonant after, ヌ doubles the vowel before (usually written with macron, but we don't want that in our URLs)
1518 'ッビャヌ'=>'bbyaa','ッビェヌ'=>'bbyee','ッビィヌ'=>'bbyii','ッビョヌ'=>'bbyoo','ッビュヌ'=>'bbyuu',
1519 'ッピャヌ'=>'ppyaa','ッピェヌ'=>'ppyee','ッピィヌ'=>'ppyii','ッピョヌ'=>'ppyoo','ッピュヌ'=>'ppyuu',
1520 'ッキャヌ'=>'kkyaa','ッキェヌ'=>'kkyee','ッキィヌ'=>'kkyii','ッキョヌ'=>'kkyoo','ッキュヌ'=>'kkyuu',
1521 'ッギャヌ'=>'ggyaa','ッギェヌ'=>'ggyee','ッギィヌ'=>'ggyii','ッギョヌ'=>'ggyoo','ッギュヌ'=>'ggyuu',
1522 'ッミャヌ'=>'mmyaa','ッミェヌ'=>'mmyee','ッミィヌ'=>'mmyii','ッミョヌ'=>'mmyoo','ッミュヌ'=>'mmyuu',
1523 'ッニャヌ'=>'nnyaa','ッニェヌ'=>'nnyee','ッニィヌ'=>'nnyii','ッニョヌ'=>'nnyoo','ッニュヌ'=>'nnyuu',
1524 'ッリャヌ'=>'rryaa','ッリェヌ'=>'rryee','ッリィヌ'=>'rryii','ッリョヌ'=>'rryoo','ッリュヌ'=>'rryuu',
1525 'ッシャヌ'=>'sshaa','ッシェヌ'=>'sshee','ッシヌ'=>'sshii','ッショヌ'=>'sshoo','ッシュヌ'=>'sshuu',
1526 'ッチャヌ'=>'cchaa','ッチェヌ'=>'cchee','ッチヌ'=>'cchii','ッチョヌ'=>'cchoo','ッチュヌ'=>'cchuu',
1527 'ッティヌ'=>'ttii',
1528 'ッヂィヌ'=>'ddii',
1529
1530 // 3 character syllables - doubled vowels
1531 'ファヌ'=>'faa','フェヌ'=>'fee','フィヌ'=>'fii','フォヌ'=>'foo',
1532 'フャヌ'=>'fyaa','フェヌ'=>'fyee','フィヌ'=>'fyii','フョヌ'=>'fyoo','フュヌ'=>'fyuu',
1533 'ヒャヌ'=>'hyaa','ヒェヌ'=>'hyee','ヒィヌ'=>'hyii','ヒョヌ'=>'hyoo','ヒュヌ'=>'hyuu',
1534 'ビャヌ'=>'byaa','ビェヌ'=>'byee','ビィヌ'=>'byii','ビョヌ'=>'byoo','ビュヌ'=>'byuu',
1535 'ピャヌ'=>'pyaa','ピェヌ'=>'pyee','ピィヌ'=>'pyii','ピョヌ'=>'pyoo','ピュヌ'=>'pyuu',
1536 'キャヌ'=>'kyaa','キェヌ'=>'kyee','キィヌ'=>'kyii','キョヌ'=>'kyoo','キュヌ'=>'kyuu',
1537 'ギャヌ'=>'gyaa','ギェヌ'=>'gyee','ギィヌ'=>'gyii','ギョヌ'=>'gyoo','ギュヌ'=>'gyuu',
1538 'ミャヌ'=>'myaa','ミェヌ'=>'myee','ミィヌ'=>'myii','ミョヌ'=>'myoo','ミュヌ'=>'myuu',
1539 'ニャヌ'=>'nyaa','ニェヌ'=>'nyee','ニィヌ'=>'nyii','ニョヌ'=>'nyoo','ニュヌ'=>'nyuu',
1540 'リャヌ'=>'ryaa','リェヌ'=>'ryee','リィヌ'=>'ryii','リョヌ'=>'ryoo','リュヌ'=>'ryuu',
1541 'シャヌ'=>'shaa','シェヌ'=>'shee','シヌ'=>'shii','ショヌ'=>'shoo','シュヌ'=>'shuu',
1542 'ゞャヌ'=>'jaa','ゞェヌ'=>'jee','ゞヌ'=>'jii','ゞョヌ'=>'joo','ゞュヌ'=>'juu',
1543 'スァヌ'=>'swaa','スェヌ'=>'swee','スィヌ'=>'swii','スォヌ'=>'swoo','スゥヌ'=>'swuu',
1544 'デァヌ'=>'daa','デェヌ'=>'dee','ディヌ'=>'dii','デォヌ'=>'doo','デゥヌ'=>'duu',
1545 'チャヌ'=>'chaa','チェヌ'=>'chee','チヌ'=>'chii','チョヌ'=>'choo','チュヌ'=>'chuu',
1546 'ヂャヌ'=>'dyaa','ヂェヌ'=>'dyee','ヂィヌ'=>'dyii','ヂョヌ'=>'dyoo','ヂュヌ'=>'dyuu',
1547 'ツャヌ'=>'tsaa','ツェヌ'=>'tsee','ツィヌ'=>'tsii','ツョヌ'=>'tsoo','ツヌ'=>'tsuu',
1548 'トァヌ'=>'twaa','トェヌ'=>'twee','トィヌ'=>'twii','トォヌ'=>'twoo','トゥヌ'=>'twuu',
1549 'ドァヌ'=>'dwaa','ドェヌ'=>'dwee','ドィヌ'=>'dwii','ドォヌ'=>'dwoo','ドゥヌ'=>'dwuu',
1550 'りァヌ'=>'whaa','りェヌ'=>'whee','りィヌ'=>'whii','りォヌ'=>'whoo','りゥヌ'=>'whuu',
1551 'ノャヌ'=>'vyaa','ノェヌ'=>'vyee','ノィヌ'=>'vyii','ノョヌ'=>'vyoo','ノュヌ'=>'vyuu',
1552 'ノァヌ'=>'vaa','ノェヌ'=>'vee','ノィヌ'=>'vii','ノォヌ'=>'voo','ノヌ'=>'vuu',
1553 'りェヌ'=>'wee','りィヌ'=>'wii',
1554 'むェヌ'=>'yee',
1555 'ティヌ'=>'tii',
1556 'ヂィヌ'=>'dii',
1557
1558 // 3 character syllables - doubled consonants
1559 'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1560 'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1561 'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1562 'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1563 'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1564 'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1565 'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1566 'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1567 'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1568 'ッティ'=>'tti',
1569 'ッヂィ'=>'ddi',
1570
1571 // 3 character syllables - doubled vowel and consonants
1572 'ッバヌ'=>'bbaa','ッベヌ'=>'bbee','ッビヌ'=>'bbii','ッボヌ'=>'bboo','ッブヌ'=>'bbuu',
1573 'ッパヌ'=>'ppaa','ッペヌ'=>'ppee','ッピヌ'=>'ppii','ッポヌ'=>'ppoo','ップヌ'=>'ppuu',
1574 'ッケヌ'=>'kkee','ッキヌ'=>'kkii','ッコヌ'=>'kkoo','ックヌ'=>'kkuu','ッカヌ'=>'kkaa',
1575 'ッガヌ'=>'ggaa','ッゲヌ'=>'ggee','ッギヌ'=>'ggii','ッゎヌ'=>'ggoo','ッグヌ'=>'gguu',
1576 'ッマヌ'=>'maa','ッメヌ'=>'mee','ッミヌ'=>'mii','ッモヌ'=>'moo','ッムヌ'=>'muu',
1577 'ッナヌ'=>'nnaa','ッネヌ'=>'nnee','ッニヌ'=>'nnii','ッノヌ'=>'nnoo','ッヌヌ'=>'nnuu',
1578 'ッラヌ'=>'rraa','ッレヌ'=>'rree','ッリヌ'=>'rrii','ッロヌ'=>'rroo','ッルヌ'=>'rruu',
1579 'ッサヌ'=>'ssaa','ッセヌ'=>'ssee','ッシヌ'=>'sshii','ッ゜ヌ'=>'ssoo','ッスヌ'=>'ssuu',
1580 'ッザヌ'=>'zzaa','ッれヌ'=>'zzee','ッゞヌ'=>'jjii','ッゟヌ'=>'zzoo','ッズヌ'=>'zzuu',
1581 'ッタヌ'=>'ttaa','ッテヌ'=>'ttee','ッチヌ'=>'chii','ットヌ'=>'ttoo','ッツヌ'=>'ttsuu',
1582 'ッダヌ'=>'ddaa','ッデヌ'=>'ddee','ッヂヌ'=>'ddii','ッドヌ'=>'ddoo','ッヅ
1583ヌ'=>'dduu',
1584
1585 // 2 character syllables - normal
1586 'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1587 // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1588 'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1589 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1590 'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1591 'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1592 'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1593 'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1594 'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1595 'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1596 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1597 'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1598 'ゞャ'=>'ja','ゞェ'=>'je','ゞョ'=>'jo','ゞュ'=>'ju',
1599 'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1600 'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1601 'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1602 // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1603 'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1604 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1605 'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1606 'ã‚Šã‚¡'=>'wha','りェ'=>'whe','ã‚Šã‚£'=>'whi','ã‚Šã‚©'=>'who','ã‚Šã‚¥'=>'whu',
1607 'ノャ'=>'vya','ノェ'=>'vye','ノィ'=>'vyi','ノョ'=>'vyo','ノュ'=>'vyu',
1608 'ノァ'=>'va','ノェ'=>'ve','ノィ'=>'vi','ノォ'=>'vo','ノ'=>'vu',
1609 'りェ'=>'we','ã‚Šã‚£'=>'wi',
1610 'むェ'=>'ye',
1611 'ティ'=>'ti',
1612 'ヂィ'=>'di',
1613
1614 // 2 character syllables - doubled vowel
1615 'アヌ'=>'aa','゚ヌ'=>'ee','むヌ'=>'ii','オヌ'=>'oo','りヌ'=>'uu',
1616 'ダヌ'=>'daa','デヌ'=>'dee','ヂヌ'=>'dii','ドヌ'=>'doo','ヅ
1617ヌ'=>'duu',
1618 'ハヌ'=>'haa','ヘヌ'=>'hee','ヒヌ'=>'hii','ホヌ'=>'hoo','フヌ'=>'fuu',
1619 'バヌ'=>'baa','ベヌ'=>'bee','ビヌ'=>'bii','ボヌ'=>'boo','ブヌ'=>'buu',
1620 'パヌ'=>'paa','ペヌ'=>'pee','ピヌ'=>'pii','ポヌ'=>'poo','プヌ'=>'puu',
1621 'ケヌ'=>'kee','キヌ'=>'kii','コヌ'=>'koo','クヌ'=>'kuu','カヌ'=>'kaa',
1622 'ガヌ'=>'gaa','ゲヌ'=>'gee','ギヌ'=>'gii','ゎヌ'=>'goo','グヌ'=>'guu',
1623 'マヌ'=>'maa','メヌ'=>'mee','ミヌ'=>'mii','モヌ'=>'moo','ムヌ'=>'muu',
1624 'ナヌ'=>'naa','ネヌ'=>'nee','ニヌ'=>'nii','ノヌ'=>'noo','ヌヌ'=>'nuu',
1625 'ラヌ'=>'raa','レヌ'=>'ree','リヌ'=>'rii','ロヌ'=>'roo','ルヌ'=>'ruu',
1626 'サヌ'=>'saa','セヌ'=>'see','シヌ'=>'shii','゜ヌ'=>'soo','スヌ'=>'suu',
1627 'ザヌ'=>'zaa','れヌ'=>'zee','ゞヌ'=>'jii','ゟヌ'=>'zoo','ズヌ'=>'zuu',
1628 'タヌ'=>'taa','テヌ'=>'tee','チヌ'=>'chii','トヌ'=>'too','ツヌ'=>'tsuu',
1629 'ワヌ'=>'waa','ヲヌ'=>'woo',
1630 'ダヌ'=>'yaa','ペヌ'=>'yoo','ナヌ'=>'yuu',
1631 'ヵヌ'=>'kaa','ヶヌ'=>'kee',
1632 // old characters
1633 'ヱヌ'=>'wee','ヰヌ'=>'wii',
1634
1635 // separate katakana 'n'
1636 'ンア'=>'n_a','ン゚'=>'n_e','ンむ'=>'n_i','ンオ'=>'n_o','ンり'=>'n_u',
1637 'ンダ'=>'n_ya','ンペ'=>'n_yo','ンナ'=>'n_yu',
1638
1639 // 2 character syllables - doubled consonants
1640 'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1641 'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1642 'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1643 'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゎ'=>'ggo','ッグ'=>'ggu',
1644 'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1645 'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1646 'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1647 'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッ゜'=>'sso','ッス'=>'ssu',
1648 'ッザ'=>'zza','ッれ'=>'zze','ッゞ'=>'jji','ッゟ'=>'zzo','ッズ'=>'zzu',
1649 'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1650 'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ
1651'=>'ddu',
1652
1653 // 1 character syllables
1654 'ア'=>'a','ã‚š'=>'e','ã‚€'=>'i','オ'=>'o','ã‚Š'=>'u','ン'=>'n',
1655 'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1656 'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1657 'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1658 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1659 'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ã‚Ž'=>'go','グ'=>'gu',
1660 'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1661 'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1662 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1663 'サ'=>'sa','セ'=>'se','シ'=>'shi','ã‚œ'=>'so','ス'=>'su',
1664 'ザ'=>'za','ã‚Œ'=>'ze','ã‚ž'=>'ji','ã‚Ÿ'=>'zo','ズ'=>'zu',
1665 'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1666 'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ
1667'=>'du',
1668 'ワ'=>'wa','ヲ'=>'wo',
1669 'ダ'=>'ya','ペ'=>'yo','ナ'=>'yu',
1670 'ヵ'=>'ka','ヶ'=>'ke',
1671 // old characters
1672 'ヱ'=>'we','ヰ'=>'wi',
1673
1674 // convert what's left (probably only kicks in when something's missing above)
1675 'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1676 'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1677
1678 // special characters
1679 '・'=>'_','、'=>'_',
1680 'ヌ'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1681
1682 // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1683 // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1684 //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1685 // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1686 // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1687 //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1688 //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1689 // 'ゞャ'=>'jya','ゞェ'=>'jye','ã‚žã‚£'=>'jyi','ゞョ'=>'jyo','ゞュ'=>'jyu',
1690 // 'ゞャ'=>'zha','ゞェ'=>'zhe','ã‚žã‚£'=>'zhi','ゞョ'=>'zho','ゞュ'=>'zhu',
1691 //'ゞャ'=>'zya','ゞェ'=>'zye','ã‚žã‚£'=>'zyi','ゞョ'=>'zyo','ゞュ'=>'zyu',
1692 //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1693 //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','ã‚€'=>'yi','ヂ'=>'dzi',
1694
1695 // "Greeklish"
1696 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Ί'=>'F','Κ'=>'Ps',
1697 'γ'=>'g','ÎŽ'=>'e','Ξ'=>'th','λ'=>'l','Ο'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1698
1699 // Thai
1700 'àž'=>'k','àž‚'=>'kh','àžƒ'=>'kh','àž„'=>'kh','àž…
1701'=>'kh','àž†'=>'kh','àž‡'=>'ng','àžˆ'=>'ch',
1702 'àž‰'=>'ch','àžŠ'=>'ch','àž‹'=>'s','àžŒ'=>'ch','àž'=>'y','àžŽ'=>'d','àž'=>'t','àž'=>'th',
1703 'àž‘'=>'d','àž’'=>'th','àž“'=>'n','àž”'=>'d','àž•'=>'t','àž–'=>'th','àž—'=>'th','àž˜'=>'th',
1704 'àž™'=>'n','àžš'=>'b','àž›'=>'p','àžœ'=>'ph','àž'=>'f','àžž'=>'ph','àžŸ'=>'f','àž '=>'ph',
1705 'àž¡'=>'m','àž¢'=>'y','àž£'=>'r','àž€'=>'rue','àž€à¹…
1706'=>'rue','àž¥'=>'l','àžŠ'=>'lue',
1707 'àžŠà¹…
1708'=>'lue','àž§'=>'w','àžš'=>'s','àž©'=>'s','àžª'=>'s','àž«'=>'h','àž¬'=>'l','àž®'=>'h',
1709 'àž°'=>'a','àž±'=>'a','àž£àž£'=>'a','àž²'=>'a','ๅ
1710'=>'a','àž³'=>'am','à¹àž²'=>'am',
1711 'àžŽ'=>'i','àžµ'=>'i','àž¶'=>'ue','àžµ'=>'ue','àžž'=>'u','àž¹'=>'u',
1712 'เ'=>'e','แ'=>'ae','โ'=>'o','àž­'=>'o',
1713 'àžµàž¢àž°'=>'ia','àžµàž¢'=>'ia','àž·àž­àž°'=>'uea','àž·àž­'=>'uea','àž±àž§àž°'=>'ua','àž±àž§'=>'ua',
1714 'ใ'=>'ai','ไ'=>'ai','àž±àž¢'=>'ai','àž²àž¢'=>'ai','àž²àž§'=>'ao',
1715 'àžžàž¢'=>'ui','àž­àž¢'=>'oi','àž·àž­àž¢'=>'ueai','àž§àž¢'=>'uai',
1716 'àžŽàž§'=>'io','à¹‡àž§'=>'eo','àžµàž¢àž§'=>'iao',
1717 '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1718 '์'=>'','๎'=>'','ํ'=>'','àžº'=>'',
1719 'ๆ'=>'2','๏'=>'o','àž¯'=>'-','๚'=>'-','๛'=>'-',
1720 '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1721 '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1722
1723 // Korean
1724 'ㄱ'=>'k','ã…
1725‹'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ã…
1726Œ'=>'th','ã„ž'=>'tt','ã…
1727‚'=>'p',
1728 'ã…
1729'=>'ph','ã…
1730ƒ'=>'pp','ã…
1731ˆ'=>'c','ã…
1732Š'=>'ch','ã…
1733‰'=>'cc','ã…
1734…
1735'=>'s','ã…
1736†'=>'ss',
1737 'ã…
1738Ž'=>'h','ã…
1739‡'=>'ng','ã„Ž'=>'n','ㄹ'=>'l','ã…
1740'=>'m', 'ã…
1741'=>'a','ã…
1742“'=>'e','ã…
1743—'=>'o',
1744 'ã…
1745œ'=>'wu','ã…
1746¡'=>'u','ã…
1747£'=>'i','ã…
1748'=>'ay','ã…
1749”'=>'ey','ã…
1750š'=>'oy','ã…
1751˜'=>'wa','ã…
1752'=>'we',
1753 'ã…
1754Ÿ'=>'wi','ã…
1755™'=>'way','ã…
1756ž'=>'wey','ã…
1757¢'=>'uy','ã…
1758‘'=>'ya','ã…
1759•'=>'ye','ã…
1760›'=>'oy',
1761 'ã…
1762 '=>'yu','ã…
1763’'=>'yay','ã…
1764–'=>'yey',
1765);
1766
1767
Note: See TracBrowser for help on using the repository browser.