Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: documentation/trunk/packages/dokuwiki-2011-05-25a/inc/SafeFN.class.php@ 25048

Last change on this file since 25048 was 25027, checked in by jmt12, 12 years ago
Adding the packages directory, and within it a configured version of dokuwiki all ready to run
File size: 6.1 KB

Line
1	<?php
2
3	/**
4	* Class to safely store UTF-8 in a Filename
5	*
6	* Encodes a UTF-8 string using only the following characters: 0-9 a-z _ . - / [ ] %
7	* The characters 0-9, a-z and "-./[_" in the original string are preserved ("plain");
8	* all other characters are "converted", i.e. represented by a substring
9	* that starts with '%'.
10	* The transition from converted substrings back to plain characters is
11	* marked with a ']'.
12	*
13	* @author Christopher Smith
14	* @date 2010-04-02
15	*/
class SafeFN {

    // 'safe' characters are a superset of $plain, $pre_indicator and $post_indicator
    private static $plain = '-./[_0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';  // introduces a converted codepoint
    private static $post_indicator = ']'; // marks the switch from converted back to plain

    /**
     * Convert a UTF-8 string to a safe ASCII string
     *
     * conversion process, per codepoint:
     * - if the codepoint is a plain or post_indicator character,
     *   - if the previous character was "converted", append post_indicator
     *     to the output and clear the "converted" flag
     *   - append the ASCII byte for the character to the output
     *
     * - if the codepoint is the pre_indicator character,
     *   - append it to the output as is and set the "converted" flag
     *
     * - all remaining characters
     *   - subtract 32 from the codepoint (0x00 - 0x1f are not expected, so
     *     space becomes our zero)
     *   - convert the reduced value to base36 (0-9a-z)
     *   - append pre_indicator followed by the base36 string to the output
     *     and set the "converted" flag
     *
     * Relies on utf8_to_unicode(), which is not defined in this class and
     * must be provided by the including code.
     *
     * @param string $filename a utf8 string, should only include printable characters - not 0x00-0x1f
     * @return string an encoded representation of $filename using only 'safe' ASCII characters
     *
     * @author Christopher Smith
     */
    public static function encode($filename) {
        return self::unicode_to_safe(utf8_to_unicode($filename));
    }

    /**
     * Convert a 'safe' encoded ASCII string back to UTF-8
     *
     * decoding process:
     * - lowercase the input, then split it into substrings in front of any
     *   occurrence of a pre or post indicator character
     * - check the first character of each substring
     *   - if it is not a pre_indicator character
     *     - if the previous substring was converted, skip over the
     *       post_indicator character
     *     - copy the codepoint values of the remaining characters to the output
     *     - clear the "converted" flag
     *   - else, if the substring is just the pre_indicator character
     *     - copy the pre_indicator character to the output (it was real data)
     *   - else (pre_indicator followed by more characters)
     *     - convert the characters after the pre_indicator from base36 to base10
     *     - add 32 (0x20) to undo the offset applied while encoding
     *     - append the codepoint to the output
     *
     * Relies on unicode_to_utf8(), which is not defined in this class and
     * must be provided by the including code.
     *
     * @param string $filename a 'safe' encoded ASCII string
     * @return string decoded utf8 representation of $filename
     *
     * @author Christopher Smith
     */
    public static function decode($filename) {
        return unicode_to_utf8(self::safe_to_unicode(strtolower($filename)));
    }

    /**
     * Check that a string contains no ASCII control characters (0x00 - 0x1f)
     *
     * @param string $printable_utf8 the string to check
     * @return bool true if no byte in the range 0x00 - 0x1f was found
     */
    public static function validate_printable_utf8($printable_utf8) {
        // 0x00 is included: the encoder subtracts 32 from every converted
        // codepoint, so nothing below 0x20 can be represented
        return !preg_match('#[\x00-\x1f]#', $printable_utf8);
    }

    /**
     * Check that a string consists only of 'safe' characters
     *
     * @param string $safe the string to check
     * @return bool true if every character is a plain, pre_indicator or post_indicator character
     */
    public static function validate_safe($safe) {
        // the set has to be quoted: post_indicator is ']', which would otherwise
        // close the character class early and leave a trailing literal "%]"
        $allowed = preg_quote(self::$plain.self::$post_indicator.self::$pre_indicator, '#');
        return !preg_match('#[^'.$allowed.']#', $safe);
    }

    /**
     * convert an array of unicode codepoints into 'safe_filename' format
     *
     * Codepoints below 32 are not supported (see validate_printable_utf8()).
     *
     * @param array int $unicode an array of unicode codepoints
     * @return string the unicode represented in 'safe_filename' format
     *
     * @author Christopher Smith
     */
    private static function unicode_to_safe($unicode) {

        $safe = '';
        $converted = false;
        $passthrough = self::$plain.self::$post_indicator;
        $pre = ord(self::$pre_indicator);

        foreach ($unicode as $codepoint) {
            if ($codepoint < 127 && (strpos($passthrough, chr($codepoint)) !== false)) {
                // plain character: close a preceding converted run first
                if ($converted) {
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } elseif ($codepoint == $pre) {
                // a literal pre_indicator is stored as a bare pre_indicator
                $safe .= self::$pre_indicator;
                $converted = true;
            } else {
                // everything else: pre_indicator + (codepoint - 32) in base36
                $safe .= self::$pre_indicator.base_convert((string)($codepoint - 32), 10, 36);
                $converted = true;
            }
        }

        // a trailing converted run is always terminated
        if ($converted) $safe .= self::$post_indicator;
        return $safe;
    }

    /**
     * convert a 'safe_filename' string into an array of unicode codepoints
     *
     * @param string $safe a filename in 'safe_filename' format
     * @return array int an array of unicode codepoints
     *
     * @author Christopher Smith
     */
    private static function safe_to_unicode($safe) {

        $unicode = array();

        // zero-width split in front of every indicator, so each substring
        // starts with an indicator (except possibly the very first one)
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator, '#');
        $split = preg_split('#(?=['.$indicators.'])#', $safe, -1, PREG_SPLIT_NO_EMPTY);
        if ($split === false) {
            return $unicode;
        }

        $converted = false;
        foreach ($split as $sub) {
            $len = strlen($sub);
            if ($sub[0] !== self::$pre_indicator) {
                // plain (unconverted) characters, optionally starting with a post_indicator
                // set initial value to skip any post_indicator
                for ($i = ($converted ? 1 : 0); $i < $len; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
                $converted = false;
            } elseif ($len == 1) {
                // a pre_indicator character in the real data
                $unicode[] = ord($sub);
                $converted = true;
            } else {
                // a single codepoint in base36, adjusted for initial 32 non-printable chars
                $unicode[] = 32 + (int)base_convert(substr($sub, 1), 36, 10);
                $converted = true;
            }
        }

        return $unicode;
    }

}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: