source: documentation/trunk/packages/dokuwiki-2011-05-25a/inc/parser/lexer.php@ 25027

Last change on this file since 25027 was 25027, checked in by jmt12, 12 years ago

Adding the packages directory, and within it a configured version of dokuwiki all ready to run

File size: 19.7 KB
Line 
1<?php
2/**
3* Author Marcus Baker: http://www.lastcraft.com
4* Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5* For an intro to the Lexer see:
6* http://www.phppatterns.com/index.php/article/articleview/106/1/2/
7* @author Marcus Baker
8* @package Doku
9* @subpackage Lexer
10* @version $Id: lexer.php,v 1.1 2005/03/23 23:14:09 harryf Exp $
11*/
12
13/**
14* Init path constant
15*/
// Abort immediately when this file is loaded without DOKU_INC being defined.
if (!defined('DOKU_INC')) {
    die('meh.');
}

/**#@+
 * Lexer mode constant. These values are handed to the parser handlers as the
 * second argument so they can tell what kind of token they received.
 */
define('DOKU_LEXER_ENTER', 1);
define('DOKU_LEXER_MATCHED', 2);
define('DOKU_LEXER_UNMATCHED', 3);
define('DOKU_LEXER_EXIT', 4);
define('DOKU_LEXER_SPECIAL', 5);
/**#@-*/
27
28/**
29 * Compounded regular expression. Any of
30 * the contained patterns could match and
31 * when one does it's label is returned.
32 * @package Doku
33 * @subpackage Lexer
34 */
class Doku_LexerParallelRegex {
    /** @var array Patterns exactly as passed to addPattern(), in insertion order. */
    var $_patterns;
    /** @var array Labels, index-aligned with $_patterns. */
    var $_labels;
    /** @var string|null Cached compound regex; null means it has to be rebuilt. */
    var $_regex;
    /** @var boolean True for case sensitive matching. */
    var $_case;
    /**
     * @var array Escaped and parenthesised form of every pattern, index-aligned
     *            with $_patterns. Rebuilt together with $_regex.
     */
    var $_compounded;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     * @access public
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_compounded = array();
        $this->_regex = null;
    }

    /**
     * Adds a pattern with an optional label. Invalidates the cached
     * compound regex so it is rebuilt on the next match()/split().
     *
     * @param mixed $pattern   Perl style regex. Must be UTF-8
     *                         encoded. If it is a string, the (, )
     *                         lose their meaning unless they
     *                         form part of a lookahead or
     *                         lookbehind assertion.
     * @param string $label    Label of regex to be returned
     *                         on a match. Label must be ASCII.
     * @access public
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        $this->_regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject  String to match against.
     * @param string $match    Receives the first matched portion of
     *                         the subject, or "" when nothing matched.
     * @return mixed           False when nothing matched, otherwise the
     *                         label of the matching pattern (true when
     *                         no label could be determined).
     * @access public
     */
    function match($subject, &$match) {
        $match = "";
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        for ($i = 1; $i < $size; $i++) {
            // Compare against the empty string instead of relying on
            // truthiness: a captured "0" is falsy in PHP and would
            // otherwise lose its label.
            if ($matches[$i] !== '' && isset($this->_labels[$i - 1])) {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * @param string $subject  String to match against.
     * @param array $split     Receives array(pre-match, match, post-match).
     *                         When nothing matched this is
     *                         array($subject, "", "").
     * @return mixed           False when nothing matched, otherwise the
     *                         label of the matching pattern (true when
     *                         that label is not set).
     * @access public
     *
     * @author Christopher Smith
     */
    function split($subject, &$split) {
        if (count($this->_patterns) == 0) {
            $split = array($subject, "", "");
            return false;
        }

        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            // A failed match may be a PCRE error rather than "no match";
            // report the known error kinds through msg().
            if(function_exists('preg_last_error')){
                $err = preg_last_error();
                switch($err){
                    case PREG_BACKTRACK_LIMIT_ERROR:
                        msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',-1);
                        break;
                    case PREG_RECURSION_LIMIT_ERROR:
                        msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',-1);
                        break;
                    case PREG_BAD_UTF8_ERROR:
                        msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin',-1);
                        break;
                    case PREG_INTERNAL_ERROR:
                        msg('A PCRE internal error occured. This might be caused by a faulty plugin',-1);
                        break;
                }
            }

            $split = array($subject, "", "");
            return false;
        }

        // The last reported group belongs to the alternative that matched;
        // group N corresponds to pattern N-1.
        $idx = count($matches)-2;
        // The compounded pattern is already wrapped in "(" ... ")", which
        // double as the regex delimiters here.
        list($pre, $post) = preg_split($this->_compounded[$idx].$this->_getPerlMatchingFlags(), $subject, 2);
        $split = array($pre, $matches[0], $post);

        return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
    }

    /**
     * Compounds the patterns into a single regular expression separated
     * with the "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The compounded patterns are kept apart from the raw ones, so adding a
     * pattern after the regex has been built once does not escape the
     * earlier patterns a second time.
     *
     * @return string          The compound regex including delimiters
     *                         and flags.
     * @access private
     */
    function _getCompoundedRegex() {
        if ($this->_regex == null) {
            $this->_compounded = array();
            $cnt = count($this->_patterns);
            for ($i = 0; $i < $cnt; $i++) {
                $this->_compounded[$i] = '(' . $this->_escapePattern($this->_patterns[$i]) . ')';
            }
            $this->_regex = "/" . implode("|", $this->_compounded) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Escapes plain "(" and ")" and unescaped "/" in one pattern while
     * leaving "(?...)" groups, bracket expressions and backslash
     * sequences intact.
     *
     * @param string $raw      Pattern as passed to addPattern().
     * @return string          Pattern safe to embed in the compound regex.
     * @access private
     */
    function _escapePattern($raw) {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $pattern = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            if ($elt === '(') {
                $pattern .= '\(';
            } elseif ($elt === ')') {
                if ($level > 0) {
                    $level--; /* closing (? */
                } else {
                    $pattern .= '\\';
                }
                $pattern .= ')';
            } elseif ($elt === '(?') {
                $level++;
                $pattern .= '(?';
            } elseif (substr($elt, 0, 1) == '\\') {
                // backslash sequences are passed through untouched
                $pattern .= $elt;
            } else {
                $pattern .= str_replace('/', '\/', $elt);
            }
        }
        return $pattern;
    }

    /**
     * Accessor for perl regex mode flags to use.
     *
     * @return string          Perl regex flags.
     * @access private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
216
217/**
218 * States for a stack machine.
219 * @package Lexer
220 * @subpackage Lexer
221 */
class Doku_LexerStateStack {
    /** @var array Stack of state names; the last element is the current state. */
    var $_stack;

    /**
     * Constructor. Starts in named state.
     *
     * @param string $start    Starting state name.
     * @access public
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * Accessor for current state.
     *
     * @return string          State on top of the stack.
     * @access public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Adds a state to the stack and sets it
     * to be the current state.
     *
     * @param string $state    New state.
     * @access public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Leaves the current state and reverts
     * to the previous one.
     *
     * @return boolean         False if only the starting state is
     *                         left, in which case nothing is popped.
     * @access public
     */
    function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}
268
269/**
270 * Accepts text and breaks it into tokens.
271 * Some optimisation to make the sure the
272 * content is only scanned by the PHP regex
273 * parser once. Lexer modes must not start
274 * with leading underscores.
275 * @package Doku
276 * @subpackage Lexer
277 */
class Doku_Lexer {
    /** @var array Map of mode name to its Doku_LexerParallelRegex. */
    var $_regexes;
    /** @var object Handler object whose methods receive the tokens. */
    var $_parser;
    /** @var Doku_LexerStateStack Stack of active modes. */
    var $_mode;
    /** @var array Map of mode name to the handler method name to use instead. */
    var $_mode_handlers;
    /** @var boolean True for case sensitive matching. */
    var $_case;

    /**
     * Sets up the lexer in case insensitive matching
     * by default.
     *
     * @param Doku_Parser $parser  Handling strategy by
     *                             reference.
     * @param string $start        Starting handler.
     * @param boolean $case        True for case sensitive.
     * @access public
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new Doku_LexerStateStack($start);
        $this->_mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular
     * parsing mode. The pattern does not change the
     * current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @access public
     */
    function addPattern($pattern, $mode = "accept") {
        $this->_addLabelledPattern($pattern, $mode, true);
    }

    /**
     * Adds a pattern that will enter a new parsing
     * mode. Useful for entering parenthesis, strings,
     * tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @param string $new_mode     Change parsing to this new
     *                             nested mode.
     * @access public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        $this->_addLabelledPattern($pattern, $mode, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode
     * and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Mode to leave.
     * @access public
     */
    function addExitPattern($pattern, $mode) {
        $this->_addLabelledPattern($pattern, $mode, "__exit");
    }

    /**
     * Adds a pattern that has a special mode. Acts as an entry
     * and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @param string $special      Use this mode for this one token.
     * @access public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        $this->_addLabelledPattern($pattern, $mode, "_$special");
    }

    /**
     * Registers a pattern under the given mode, creating the mode's
     * parallel regex on first use.
     *
     * @param string $pattern      Perl style regex.
     * @param string $mode         Mode the pattern belongs to.
     * @param mixed $label         Label stored with the pattern: true,
     *                             a mode name, "__exit" or "_<special>".
     * @access private
     */
    function _addLabelledPattern($pattern, $mode, $label) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $label);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode         Mode to be remapped.
     * @param string $handler      New target handler.
     * @access public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens. Will fail
     * if the handlers report an error or if no
     * content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the
     * held listener.
     *
     * @param string $raw          Raw text to tokenise.
     * @return boolean             True on success, else false.
     * @access public
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // _reduce() eats the consumed part off the front of $raw and returns
        // an array for every token found.
        while (is_array($parsed = $this->_reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the matched token within the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength == $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match any pattern
        return $this->_invokeParser($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * $mode has no default value: every argument after it is required,
     * so it could never be omitted anyway, and PHP 8 deprecates optional
     * parameters that precede required ones.
     *
     * @param string $unmatched    Unmatched leading portion.
     * @param string $matched      Actual token match.
     * @param mixed $mode          Mode after match. A non-string
     *                             mode causes no change.
     * @param int $initialPos      Byte index in the raw document at
     *                             which the unmatched portion starts.
     * @param int $matchPos        Byte index in the raw document at
     *                             which the matched token starts.
     * @return boolean             False if there was any error
     *                             from the parser.
     * @access private
     */
    function _dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos) {
        if (! $this->_invokeParser($unmatched, DOKU_LEXER_UNMATCHED, $initialPos) ){
            return false;
        }
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            // enter the special mode just for this one token
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if (is_string($mode)) {
            $this->_mode->enter($mode);
            return $this->_invokeParser($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->_invokeParser($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave
     * the current mode and pop an item from the matching
     * mode stack.
     *
     * @param string $mode         Mode to test.
     * @return boolean             True if this is the exit mode.
     * @access private
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode
     * is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param string $mode         Mode to test.
     * @return boolean             True if the mode name starts
     *                             with an underscore.
     * @access private
     */
    function _isSpecialMode($mode) {
        return (strncmp($mode, "_", 1) == 0);
    }

    /**
     * Strips the magic underscore marking single token
     * modes.
     *
     * @param string $mode         Mode to decode.
     * @return string              Underlying mode name.
     * @access private
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current
     * mode. Empty content will be ignored. The lexer
     * has a parser handler for each mode in the lexer.
     *
     * @param string $content      Text parsed.
     * @param int $is_match        One of the DOKU_LEXER_* constants
     *                             describing the token.
     * @param int $pos             Current byte index location in raw doc
     *                             that is being parsed.
     * @return mixed               True for empty content, otherwise
     *                             whatever the handler returns.
     * @access private
     */
    function _invokeParser($content, $is_match, $pos) {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->_mode->getCurrent();
        if (isset($this->_mode_handlers[$handler])) {
            $handler = $this->_mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if(substr($handler,0,7)=='plugin_'){
            list($handler,$plugin) = explode('_',$handler,2);
            return $this->_parser->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->_parser->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful
     * removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw          The subject to parse. This is the
     *                             content that will be eaten.
     * @return mixed               Three item list of unparsed
     *                             content followed by the
     *                             recognised token and finally the
     *                             action the parser is to take.
     *                             True if no match, false if there
     *                             is a parsing error.
     * @access private
     */
    function _reduce(&$raw) {
        if (! isset($this->_regexes[$this->_mode->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->_regexes[$this->_mode->getCurrent()]->split($raw, $split)) {
            // $raw is replaced by the text following the match
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }
}
551
552/**
553* Escapes regex characters other than (, ) and /
554* @TODO
555*/
function Doku_Lexer_Escape($str) {
    // Prefixes each of  \ . + * ? [ ^ ] $ { } = ! < > | :  with a backslash
    // and returns the result; "(", ")" and "/" are deliberately left alone.
    //
    // Done in a single pass so the backslashes that get inserted are never
    // escaped again. The replacement '\\\\$0' reaches PCRE as \\$0, i.e. one
    // literal backslash followed by the matched character. (A replacement of
    // \$ would be read as an escaped dollar sign and emit a bare "$", leaving
    // the dollar sign unescaped.)
    return preg_replace('/[\\\\.+*?\[\^\]$\{\}=!<>|:]/', '\\\\$0', $str);
}
599
600//Setup VIM: ex: et ts=4 sw=4 :
Note: See TracBrowser for help on using the repository browser.