<?php

/**
 * Class to safely store UTF-8 in a filename
 *
 * Encodes a UTF-8 string using only the following characters: 0-9a-z_.-/[]%
 * The characters 0-9a-z_.-/[ in the original string are preserved ("plain").
 * All other characters are represented by a substring that starts
 * with '%' ("converted").
 * The transition from a converted substring back to plain characters is
 * marked with a ']'.
 *
 * @author   Christopher Smith
 * @date     2010-04-02
 */
class SafeFN {

    // The 'safe' output alphabet is the union of $plain, $pre_indicator and $post_indicator.
    private static $plain = '-./[_0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';   // starts every converted (base36) substring
    private static $post_indicator = ']';  // marks the return from converted to plain characters

    /**
     * Convert a UTF-8 string to a safe ASCII string
     *
     * Conversion process, applied to each codepoint in turn:
     *  - if the codepoint is a plain or post_indicator character,
     *    - if the previous character was "converted", append post_indicator to the
     *      output and clear the "converted" flag
     *    - append the ASCII byte for the character to the output
     *
     *  - if the codepoint is the pre_indicator character,
     *    - append it to the output and set the "converted" flag
     *
     *  - all remaining characters
     *    - subtract 32 from the codepoint, so that space becomes zero
     *    - convert the reduced value to base36 (0-9a-z)
     *    - append the pre_indicator followed by the base36 string to the output
     *      and set the "converted" flag
     *
     * A trailing post_indicator is appended when the string ends in a converted character.
     *
     * Declared static because the class holds only static state; it can still be
     * invoked on an instance.
     *
     * @param string $filename a utf8 string; should only include printable characters,
     *                         not 0x00-0x1f (their reduced value would be negative)
     * @return string an encoded representation of $filename using only 'safe' ASCII characters
     *
     * @author Christopher Smith
     */
    public static function encode($filename) {
        // utf8_to_unicode() is not defined in this file; it is expected to be provided
        // by the surrounding project and to return an array of codepoints.
        return self::unicode_to_safe(utf8_to_unicode($filename));
    }

    /**
     * Convert a 'safe' encoded ASCII string back to UTF-8
     *
     * Decoding process:
     *  - lower-case the input, then split it in front of every pre or post
     *    indicator character
     *  - check the first character of each substring
     *    - if it is not a pre_indicator character
     *      - if the previous substring was converted, skip over the leading
     *        post_indicator character
     *      - copy the codepoint values of the remaining characters to the output array
     *      - clear the converted flag
     *
     *    - else (it is a pre_indicator character)
     *      - if the substring length is 1, copy the pre_indicator character itself
     *        to the output array (it was a literal in the original data)
     *      - else skip the pre_indicator, convert the remaining string from base36
     *        to base10, add 32 and append that codepoint to the output array
     *      - set the converted flag
     *
     * @param string $filename a 'safe' encoded ASCII string
     * @return string decoded utf8 representation of $filename
     *
     * @author Christopher Smith
     */
    public static function decode($filename) {
        // unicode_to_utf8() is not defined in this file; it is expected to be provided
        // by the surrounding project and to accept an array of codepoints.
        return unicode_to_utf8(self::safe_to_unicode(strtolower($filename)));
    }

    /**
     * Check that a string contains no ASCII control characters
     *
     * Rejects every byte in the range 0x00-0x1f, which is exactly the range
     * encode() cannot represent.
     *
     * @param string $printable_utf8 the string to check
     * @return bool true if no byte in 0x00-0x1f is present
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('#[\x00-\x1f]#', $printable_utf8);
    }

    /**
     * Check that a string consists solely of characters from the 'safe' alphabet
     *
     * The alphabet is escaped with preg_quote() before it is placed inside the
     * character class: it contains ']', '[' and '-', and an unescaped ']' would
     * terminate the class early and let unsafe characters through.
     *
     * @param string $safe the string to check
     * @return bool true if every character is a plain, pre_indicator or post_indicator character
     */
    public static function validate_safe($safe) {
        $alphabet = preg_quote(self::$plain . self::$post_indicator . self::$pre_indicator, '#');
        return !preg_match('#[^' . $alphabet . ']#', $safe);
    }

    /**
     * Convert an array of unicode codepoints into 'safe_filename' format
     *
     * @param array $unicode an array of integer unicode codepoints
     * @return string the unicode represented in 'safe_filename' format
     *
     * @author Christopher Smith
     */
    private static function unicode_to_safe($unicode) {

        $safe = '';
        $converted = false;
        $passthrough = self::$plain . self::$post_indicator;

        foreach ($unicode as $codepoint) {
            if ($codepoint < 127 && (strpos($passthrough, chr($codepoint)) !== false)) {
                // plain character: close a preceding converted run first
                if ($converted) {
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if ($codepoint == ord(self::$pre_indicator)) {
                // a literal pre_indicator is emitted on its own, with no base36 payload
                $safe .= self::$pre_indicator;
                $converted = true;
            } else {
                // everything else: pre_indicator + base36 of (codepoint - 32)
                $safe .= self::$pre_indicator . base_convert((string)($codepoint - 32), 10, 36);
                $converted = true;
            }
        }

        // terminate a converted run that reaches the end of the string
        if ($converted) $safe .= self::$post_indicator;
        return $safe;
    }

    /**
     * Convert a 'safe_filename' string into an array of unicode codepoints
     *
     * @param string $safe a filename in 'safe_filename' format
     * @return array an array of integer unicode codepoints
     *
     * @author Christopher Smith
     */
    private static function safe_to_unicode($safe) {

        $unicode = array();

        // zero-width split in front of every indicator, so each piece starts with
        // an indicator (except possibly the first); indicators are escaped so the
        // character class does not depend on their order
        $indicators = preg_quote(self::$post_indicator . self::$pre_indicator, '#');
        $split = preg_split('#(?=[' . $indicators . '])#', $safe, -1, PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            $len = strlen($sub);
            if ($sub[0] !== self::$pre_indicator) {
                // plain (unconverted) characters, optionally starting with a post_indicator;
                // start at 1 to skip that post_indicator when it closes a converted run
                for ($i = ($converted ? 1 : 0); $i < $len; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
                $converted = false;
            } else if ($len == 1) {
                // a pre_indicator character in the real data
                $unicode[] = ord($sub);
                $converted = true;
            } else {
                // a single codepoint in base36, adjusted for initial 32 non-printable chars
                $unicode[] = 32 + (int)base_convert(substr($sub, 1), 36, 10);
                $converted = true;
            }
        }

        return $unicode;
    }

}