source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wiki2xml/php/wiki2xml.php@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 42.7 KB
Line 
1<?php
2# Copyright by Magnus Manske (2005)
3# Released under GPL
4
5$wiki2xml_authors = array () ;
6
7class wiki2xml
8 {
9 var $cnt = 0 ; # For debugging purposes
10 var $protocols = array ( "http" , "https" , "news" , "ftp" , "irc" , "mailto" ) ;
11 var $errormessage = "ERROR!" ;
12 var $compensate_markup_errors = true;
13 var $auto_fill_templates = 'all' ; # Will try and replace templates right inline, instead of using <template> tags; requires global $content_provider
14 var $use_space_tag = true ; # Use <space/> instead of spaces before and after tags
15 var $allowed = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890 +-#:;.,%="\'\\' ;
16 var $directhtmltags = array (
17 "b" => "xhtml:b",
18 "i" => "xhtml:i",
19 "u" => "xhtml:u",
20 "s" => "xhtml:s",
21 "p" => "xhtml:p",
22 "br" => "xhtml:br",
23 "em" => "xhtml:em",
24 "div" => "xhtml:div",
25 "span" => "xhtml:span",
26 "big" => "xhtml:big",
27 "small" => "xhtml:small",
28 "sub" => "xhtml:sub",
29 "sup" => "xhtml:sup",
30 "font" => "xhtml:font",
31 "center" => "xhtml:center",
32 "table" => "xhtml:table",
33 "tr" => "xhtml:tr",
34 "th" => "xhtml:th",
35 "td" => "xhtml:td",
36 "pre" => "xhtml:pre",
37 "code" => "xhtml:code",
38 "caption" => "xhtml:caption",
39 "cite" => "xhtml:cite",
40 "ul" => "xhtml:ul",
41 "ol" => "xhtml:ol",
42 "li" => "xhtml:li",
43 "tt" => "xhtml:tt",
44 "h1" => "xhtml:h1",
45 "h2" => "xhtml:h2",
46 "h3" => "xhtml:h3",
47 "h4" => "xhtml:h4",
48 "h5" => "xhtml:h5",
49 "h6" => "xhtml:h6",
50 "h7" => "xhtml:h7",
51 "h8" => "xhtml:h8",
52 "h9" => "xhtml:h9",
53 ) ;
54
55 var $w ; # The wiki text
56 var $wl ; # The wiki text length
57 var $bold_italics ;
58 var $tables = array () ; # List of open tables
59 var $profile = array () ;
60
61 # Some often used functions
62
/**
 * Copies the parser settings of another parser into this one.
 *
 * Sub-parsers created by this class call this right after construction so
 * that they behave like their parent. Only configuration is copied; the
 * wiki text, its length, the bold/italics state and the table list are not.
 *
 * @param wiki2xml $base Parser to take the settings from
 */
function inherit ( &$base )
{
    $this->protocols = $base->protocols ;
    $this->errormessage = $base->errormessage ;
    $this->compensate_markup_errors = $base->compensate_markup_errors ;
    $this->auto_fill_templates = $base->auto_fill_templates ;
    $this->use_space_tag = $base->use_space_tag ;
    $this->allowed = $base->allowed ;
    $this->directhtmltags = $base->directhtmltags ;

    # p_template() reads $this->template_list when auto_fill_templates is
    # neither 'all' nor 'none'; hand it down as well when the base has one,
    # otherwise a sub-parser would look templates up in an unset list.
    if ( isset ( $base->template_list ) ) $this->template_list = $base->template_list ;
}
77
/**
 * Runs the parser method "p_{$f}" at the current position.
 *
 * @param int    $a           Cursor into the wiki text; moved by the called method
 * @param string $xml         Output buffer the called method appends to
 * @param string $f           Method name without the "p_" prefix
 * @param bool   $atleastonce When false the call always reports success
 * @param bool   $many        When true the method is repeated for as long as it matches
 * @return bool true if the method matched at least once, or if $atleastonce is false
 */
function once ( &$a , &$xml , $f , $atleastonce = true , $many = false )
{
    $method = "p_{$f}" ;
    $hits = 0 ;
    do {
        $matched = $this->$method ( $a , $xml ) ;
        if ( $matched ) $hits++ ;
    } while ( $matched && $many ) ;

    return !$atleastonce || $hits > 0 ;
}
98
/**
 * Runs the parser method "p_{$f}" repeatedly; succeeds if it matched at least once.
 *
 * @param int    $a   Cursor into the wiki text
 * @param string $xml Output buffer
 * @param string $f   Method name without the "p_" prefix
 * @return bool
 */
function onceormore ( &$a , &$xml , $f )
{
    $atleastonce = true ;
    $many = true ;
    return $this->once ( $a , $xml , $f , $atleastonce , $many ) ;
}
103
/**
 * Tests whether the wiki text continues with $t at position $a.
 *
 * @param int    $a           Cursor; advanced past $t on a match when $movecounter is true
 * @param string $t           Text to look for
 * @param bool   $movecounter Whether to consume the matched text
 * @return bool true on a match
 */
function nextis ( &$a , $t , $movecounter = true )
{
    $len = strlen ( $t ) ;
    $matches = !( substr ( $this->w , $a , $len ) != $t ) ;
    if ( !$matches ) return false ;
    if ( $movecounter ) $a += $len ;
    return true ;
}
110
/**
 * Appends the HTML-escaped character at position $a to $x and advances the cursor.
 *
 * @param int    $a Cursor into the wiki text
 * @param string $x Buffer the escaped character is appended to
 * @return bool false when the cursor is already at or past the end of the text
 */
function nextchar ( &$a , &$x )
{
    if ( $a < $this->wl ) {
        $x .= htmlspecialchars ( $this->w[$a] ) ;
        $a++ ;
        return true ;
    }
    return false ;
}
118
/**
 * Tells whether $c sorts within 'A'..'Z' or 'a'..'z'.
 *
 * @param string $c Value to test (callers in this class pass one character)
 * @return bool
 */
function ischaracter ( $c )
{
    $upper = ( $c >= 'A' && $c <= 'Z' ) ;
    $lower = ( $c >= 'a' && $c <= 'z' ) ;
    return $upper || $lower ;
}
125
/**
 * Moves the cursor forward over a run of $blank characters.
 *
 * @param int    $a     Cursor into the wiki text
 * @param string $blank Character to skip (a space by default)
 */
function skipblanks ( &$a , $blank = " " )
{
    for ( ; $a < $this->wl ; $a++ ) {
        if ( $this->w[$a] != $blank ) break ;
    }
}
134
135 ##############
136
137
/**
 * Parses the target part of an internal link or template call.
 *
 * Thin wrapper around p_internal_link_text() with the "target" flag set.
 *
 * @param int    $a       Cursor into the wiki text
 * @param string $xml     Output buffer
 * @param string $closeit Closing marker of the surrounding construct
 * @return bool
 */
function p_internal_link_target ( &$a , &$xml , $closeit = "]]" )
{
    $istarget = true ;
    return $this->p_internal_link_text ( $a , $xml , $istarget , $closeit ) ;
}
142
/**
 * Variant of p_internal_link_text() that p_bold() / p_italics() recurse into
 * while inside link text; the text is appended without <part> wrapping.
 *
 * @param int    $a       Cursor into the wiki text
 * @param string $xml     Output buffer
 * @param string $closeit Closing marker of the surrounding construct
 * @return bool
 */
function p_internal_link_text2 ( &$a , &$xml , $closeit )
{
    $ok = $this->p_internal_link_text ( $a , $xml , false , $closeit , false ) ;

    # Dirty hack to ensure good XML; FIXME!!!
    # Inside "]]"-terminated text the result is rejected while bold/italics
    # markup is still open.
    if ( $closeit == ']]' && '' != $this->bold_italics ) return false ;
    return $ok ;
}
150
/**
 * Parses one "|"-separated part of an internal link or template call.
 *
 * Scanning stops in front of the next "|" or in front of $closeit. Unless
 * $closeit is "}}", a line break inside the part makes the parse fail.
 * The bold/italics state is cleared while scanning and restored on exit.
 *
 * @param int    $a        Cursor; moved to the stop position on success
 * @param string $xml      Output buffer
 * @param bool   $istarget true for the first part (only templates are expanded in it)
 * @param string $closeit  Closing marker of the surrounding construct
 * @param bool   $mark     Wrap the result in <target> / <part> when true
 * @return bool
 */
function p_internal_link_text ( &$a , &$xml , $istarget = false , $closeit = "]]" , $mark = true )
{
    $pos = $a ;
    $out = "" ;
    if ( $pos >= $this->wl ) return false ;

    $saved_bi = $this->bold_italics ;
    $this->bold_italics = '' ;
    $close_first = $closeit[0] ;
    $single_line = ( $closeit != "}}" ) ;

    while ( true )
    {
        if ( $pos >= $this->wl ) {
            $this->bold_italics = $saved_bi ;
            return false ;
        }
        $c = $this->w[$pos] ;
        if ( $single_line && $c == "\n" ) {
            $this->bold_italics = $saved_bi ;
            return false ;
        }
        if ( $c == "|" ) break ;
        if ( $c == $close_first && $this->nextis ( $pos , $closeit , false ) ) break ;

        if ( $istarget ) {
            if ( $c == "{" && $this->once ( $pos , $out , "template" ) ) continue ;
        } else {
            if ( $c == "[" ) {
                if ( $this->once ( $pos , $out , "internal_link" ) ) continue ;
                if ( $this->once ( $pos , $out , "external_link" ) ) continue ;
            } else if ( $c == "{" ) {
                if ( $this->once ( $pos , $out , "template_variable" ) ) continue ;
                if ( $this->once ( $pos , $out , "template" ) ) continue ;
            } else if ( $c == "<" ) {
                if ( $this->once ( $pos , $out , "html" ) ) continue ;
            } else if ( $c == "'" ) {
                # Bold / italics consume the remainder of the part themselves
                if ( $this->p_bold ( $pos , $out , "internal_link_text2" , $closeit ) ) break ;
                if ( $this->p_italics ( $pos , $out , "internal_link_text2" , $closeit ) ) break ;
            }
            # NOTE(review): this guard looks at offsets 5 and 7 from the START
            # of the part ($a), not from the current position, and wants both
            # to be '/'. Kept exactly as found - confirm the intent before
            # changing it, since a free link here would not stop at "]".
            if ( $pos + 10 < $this->wl &&
                    ( $this->w[$a+5] == '/' && $this->w[$a+7] == '/' ) &&
                    $this->once ( $pos , $out , "external_freelink" ) ) continue ;
        }

        $out .= htmlspecialchars ( $c ) ;
        $pos++ ;
    }

    # Template arguments are handed back as raw wiki text
    if ( $closeit == "}}" && !$istarget ) {
        $xml .= substr ( $this->w , $a , $pos - $a ) ;
        $a = $pos ;
        $this->bold_italics = $saved_bi ;
        return true ;
    }

    $out = trim ( str_replace ( "\n" , "" , $out ) ) ;
    if ( !$mark ) $xml .= $out ;
    else if ( $istarget ) $xml .= "<target>{$out}</target>" ;
    else $xml .= "<part>{$out}</part>" ;

    $a = $pos ;
    $this->bold_italics = $saved_bi ;
    return true ;
}
213
/**
 * Parses the letters that directly follow an internal link ("link trail").
 *
 * @param int    $a   Cursor; moved behind the trail on success
 * @param string $xml Output buffer; receives <trail>...</trail>
 * @return bool false when no letter follows
 */
function p_internal_link_trail ( &$a , &$xml )
{
    $pos = $a ;
    $trail = "" ;
    while ( true )
    {
        $ch = "" ;
        if ( !$this->nextchar ( $pos , $ch ) ) break ; # End of text
        if ( !$this->ischaracter ( $ch ) ) {
            $pos-- ; # Give the non-letter back
            break ;
        }
        $trail .= $ch ;
    }

    if ( $trail == "" ) return false ; # No link trail
    $xml .= "<trail>{$trail}</trail>" ;
    $a = $pos ;
    return true ;
}
239
/**
 * Parses an internal link of the form [[target|part|...]]trail.
 *
 * @param int    $a   Cursor; moved behind the link (and its trail) on success
 * @param string $xml Output buffer; receives <link>...</link>
 * @return bool
 */
function p_internal_link ( &$a , &$xml )
{
    $inner = "" ;
    $pos = $a ;
    if ( !$this->nextis ( $pos , "[[" ) ) return false ;
    if ( !$this->p_internal_link_target ( $pos , $inner , "]]" ) ) return false ;

    # Any number of "|part" sections until the closing brackets
    while ( !$this->nextis ( $pos , "]]" ) )
    {
        if ( !$this->nextis ( $pos , "|" ) ) return false ;
        if ( !$this->p_internal_link_text ( $pos , $inner , false , "]]" ) ) return false ;
    }

    $this->p_internal_link_trail ( $pos , $inner ) ; # Optional
    $xml .= "<link>{$inner}</link>" ;
    $a = $pos ;
    return true ;
}
257
/**
 * Parses a magic variable of the form __NAME__.
 *
 * The name is everything between the opening "__" and the next "_"; it is
 * HTML-escaped before being written, so characters such as "<" or "&"
 * between the underscores cannot break the generated XML.
 *
 * @param int    $a   Cursor; moved behind the closing "__" on success
 * @param string $xml Output buffer; receives <magic_variable>...</magic_variable>
 * @return bool
 */
function p_magic_variable ( &$a , &$xml )
{
    $pos = $a ;
    if ( !$this->nextis ( $pos , "__" ) ) return false ;

    $varname = "" ;
    for ( $end = $pos ; $end < $this->wl && $this->w[$end] != '_' ; $end++ )
        $varname .= $this->w[$end] ;
    if ( !$this->nextis ( $end , "__" ) ) return false ;

    $xml .= "<magic_variable>" . htmlspecialchars ( $varname ) . "</magic_variable>" ;
    $a = $end ;
    return true ;
}
271
/**
 * Parses a template call {{target|arg|name=value|...}}.
 *
 * Utilizes parts of the internal link methods. Depending on the target and
 * on $this->auto_fill_templates one of three things happens:
 *  - target starts with "#": the result of process_template_logic() is
 *    spliced into the wiki text in place of the call;
 *  - the template is to be replaced: its text is fetched from the global
 *    $content_provider, cleaned up, its variables are filled in and the
 *    result is spliced into the wiki text in place of the call;
 *  - otherwise a <template> element is appended to $xml and the cursor is
 *    moved behind the call.
 * In the first two cases the cursor stays where it was, so the caller
 * re-parses the substituted text.
 *
 * Requires the globals $content_provider and $xmlg.
 *
 * @param int    $a   Cursor into the wiki text
 * @param string $xml Output buffer
 * @return bool false if there is no well-formed template call at $a, or if
 *              $xmlg["useapi"] is set
 */
function p_template ( &$a , &$xml )
{
    global $content_provider , $xmlg ;
    if ( $xmlg["useapi"] ) return false ; # API already resolved templates

    $x = "" ;
    $b = $a ;
    if ( !$this->nextis ( $b , "{{" ) ) return false ;
    if ( !$this->p_internal_link_target ( $b , $x , "}}" ) ) return false ;
    $target = $x ;
    $variables = array () ;
    $varnames = array () ;
    $vcount = 1 ;
    while ( 1 )
    {
        if ( $this->nextis ( $b , "}}" ) ) break ;
        if ( !$this->nextis ( $b , "|" ) ) return false ;
        $l1 = strlen ( $x ) ;
        if ( !$this->p_internal_link_text ( $b , $x , false , "}}" ) ) return false ;

        # Split at the FIRST "=" only, so that values which contain "="
        # themselves (URLs, style attributes, ...) are kept complete.
        $v = explode ( "=" , substr ( $x , $l1 ) , 2 ) ;
        if ( count ( $v ) < 2 ) $vk = $vcount ; # Positional argument
        else {
            $vk = trim ( array_shift ( $v ) ) ;
            $varnames[$vcount] = $vk ;
        }
        $vv = array_shift ( $v ) ;
        $variables[$vk] = $vv ;
        # Every argument is reachable by its position as well
        if ( !isset ( $variables[$vcount] ) ) $variables[$vcount] = $vv ;
        $vcount++ ;
    }

    # $target arrives wrapped as <target>...</target>; strip the wrapper.
    # (explode() results go through a variable because array_pop() and
    # array_shift() take their argument by reference.)
    $pieces = explode ( ">" , $target , 2 ) ;
    $target = array_pop ( $pieces ) ;
    $pieces = explode ( "<" , $target , 2 ) ;
    $target = array_shift ( $pieces ) ;

    if ( $this->auto_fill_templates == 'all' ) $replace_template = true ;
    else if ( $this->auto_fill_templates == 'none' ) $replace_template = false ;
    else {
        # The list may never have been set on this parser; treat that as empty
        $list = ( isset ( $this->template_list ) && is_array ( $this->template_list ) ) ? $this->template_list : array () ;
        $found = in_array ( ucfirst ( $target ) , $list ) ;
        if ( $found AND $this->auto_fill_templates == 'these' ) $replace_template = true ;
        else if ( !$found AND $this->auto_fill_templates == 'notthese' ) $replace_template = true ;
        else $replace_template = false ;
    }

    if ( substr ( $target , 0 , 1 ) == '#' ) { # Try template logic
        $between = $this->process_template_logic ( $target , $variables ) ;
        # Change source (!)
        $w1 = substr ( $this->w , 0 , $a ) ;
        $w2 = substr ( $this->w , $b ) ;
        $this->w = $w1 . $between . $w2 ;
        $this->wl = strlen ( $this->w ) ;
    } else if ( $replace_template ) { # Do not generate <template> sections, but rather replace the template call with the template text

        # Get template text
        $between = trim ( $content_provider->get_template_text ( $target ) ) ;
        add_authors ( $content_provider->authors ) ;

        # Removing <noinclude> stuff
        $between = preg_replace( '?<noinclude>.*</noinclude>?msU', '', $between);
        $between = str_replace ( "<include>" , "" , $between ) ;
        $between = str_replace ( "</include>" , "" , $between ) ;
        $between = str_replace ( "<includeonly>" , "" , $between ) ;
        $between = str_replace ( "</includeonly>" , "" , $between ) ;

        # Remove HTML comments
        $between = str_replace ( "-->\n" , "-->" , $between ) ;
        $between = preg_replace( '?<!--.*-->?msU', '', $between) ;

        # Replacing template variables.
        # ATTENTION: Template variables within <nowiki> sections of templates will be replaced as well!

        # A table must start on a line of its own
        if ( $a > 0 && substr ( $between , 0 , 2 ) == '{|' )
            $between = "\n" . $between ;

        $this->replace_template_variables ( $between , $variables ) ;

        # Change source (!)
        $w1 = substr ( $this->w , 0 , $a ) ;
        $w2 = substr ( $this->w , $b ) ;
        $this->w = $w1 . $between . $w2 ;
        $this->wl = strlen ( $this->w ) ;
    } else {
        $xml .= "<template><target>{$target}</target>";
        for ( $i = 1 ; $i < $vcount ; $i++ ) {
            if ( isset( $varnames[$i] ) ) $xml .= "<arg name=\"{$varnames[$i]}\">{$variables[$i]}</arg>";
            else $xml .= "<arg>{$variables[$i]}</arg>";
        }
        $xml .= "</template>" ;
        $a = $b ;
    }
    return true ;
}
365
/**
 * Evaluates the parser functions "#if:" and "#switch:".
 *
 * $variables is expected in the layout p_template() builds: every argument
 * under its position (counted from 1) and, for "name=value" arguments,
 * additionally under its name. A plain 0-based list (for #switch: a list of
 * "key=value" strings) is accepted as well.
 *
 * #if:     a non-empty condition yields the first argument (THEN); an empty
 *          one yields the second argument (ELSE), or '' when there is none.
 * #switch: yields the argument whose name equals the switch value, else the
 *          "#default" argument if present, else the switch value itself.
 * Anything else is returned unchanged.
 *
 * @param string $title     Template target, e.g. "#if: foo"
 * @param array  $variables Template arguments
 * @return string|null Replacement text
 */
function process_template_logic ( $title , $variables ) {

    # TODO : Process title and variables for sub-template-replacements

    # A 0-based list has a key 0; p_template() numbers arguments from 1
    $zero_based = isset ( $variables[0] ) ;

    if ( substr ( $title , 0 , 4 ) == "#if:" ) {
        $condition = trim ( substr ( $title , 4 ) ) ;
        if ( $condition != '' ) return array_shift ( $variables ) ; # THEN

        # ELSE is the second argument; with only a THEN branch the result is empty
        $else_key = $zero_based ? 1 : 2 ;
        return isset ( $variables[$else_key] ) ? $variables[$else_key] : '' ;
    }

    if ( substr ( $title , 0 , 8 ) == "#switch:" ) {
        $parts = explode ( ':' , $title , 2 ) ;
        $title = trim ( array_pop ( $parts ) ) ;

        # Arguments arrive already split into name => value
        # NOTE(review): a purely numeric switch value also hits the
        # positional entry of that number - confirm whether that matters.
        if ( isset ( $variables[$title] ) ) return $variables[$title] ;

        # Plain list of unsplit "key=value" strings
        if ( $zero_based ) {
            foreach ( $variables AS $v ) {
                $v = explode ( '=' , $v , 2 ) ;
                if ( count ( $v ) < 2 ) continue ; # No value given
                if ( trim ( $v[0] ) != $title ) continue ; # Wrong key
                return $v[1] ; # Correct key, return value
            }
        }

        if ( isset ( $variables['#default'] ) ) return $variables['#default'] ;
    }

    # BAD PARSER FUNCTION! Ignoring...
    return $title ;
}
389
/**
 * Replaces every {{{variable}}} in $text in place.
 *
 * Does nothing when the global $xmlg["useapi"] is set.
 *
 * @param string $text      Template text; modified in place
 * @param array  $variables Template arguments, name/position => value
 * @return false|null false when skipped because of $xmlg["useapi"]
 */
function replace_template_variables ( &$text , &$variables ) {
    global $xmlg ;
    if ( $xmlg["useapi"] ) return false ; # API already resolved templates

    # The text shrinks or grows while replacing, so its length is re-read each round
    $pos = 0 ;
    while ( $pos + 3 < strlen ( $text ) ) {
        if ( $text[$pos] == '{' ) {
            # Replace at this position until nothing replaceable starts here
            while ( $this->p_template_replace_single_variable ( $text , $pos , $variables ) ) ;
        }
        $pos++ ;
    }
}
398
/**
 * Replaces the {{{name|default}}} that starts at offset $a of $text.
 *
 * Variables nested inside the name or the default are replaced first. The
 * value comes from $variables[name] when set, otherwise the default (or an
 * empty string) is used.
 *
 * @param string $text      Template text; modified in place
 * @param int    $a         Offset of the opening "{{{"
 * @param array  $variables Template arguments, name/position => value
 * @return bool false when no complete variable starts at $a
 */
function p_template_replace_single_variable ( &$text , $a , &$variables ) {
    if ( substr ( $text , $a , 3 ) != '{{{' ) return false ;
    $pos = $a + 3 ;

    # Name: up to a top-level "|" or "}}}"
    $name_start = $pos ;
    $depth = 0 ; # Brace nesting
    while ( $pos < strlen ( $text ) ) {
        if ( $depth <= 0 && ( $text[$pos] == '|' || substr ( $text , $pos , 3 ) == '}}}' ) ) break ;
        if ( $this->p_template_replace_single_variable ( $text , $pos , $variables ) ) continue ;
        if ( $text[$pos] == '{' ) $depth++ ;
        if ( $text[$pos] == '}' ) $depth-- ;
        $pos++ ;
    }
    if ( $pos >= strlen ( $text ) ) return false ;
    $name = trim ( substr ( $text , $name_start , $pos - $name_start ) ) ;

    # Default value: up to a top-level "}}}"
    $value = "" ;
    if ( $text[$pos] == '|' ) {
        $pos++ ;
        $default_start = $pos ;
        $depth = 0 ;
        while ( $pos < strlen ( $text ) ) {
            if ( $depth <= 0 && substr ( $text , $pos , 3 ) == '}}}' ) break ;
            if ( $this->p_template_replace_single_variable ( $text , $pos , $variables ) ) continue ;
            if ( $text[$pos] == '{' ) $depth++ ;
            if ( $text[$pos] == '}' ) $depth-- ;
            $pos++ ;
        }
        if ( $pos >= strlen ( $text ) ) return false ;
        $value = trim ( substr ( $text , $default_start , $pos - $default_start ) ) ;
    }

    # Replace
    $pos += 3 ; # }}}
    if ( isset ( $variables[$name] ) ) $value = $variables[$name] ;
    $text = substr ( $text , 0 , $a ) . $value . substr ( $text , $pos ) ;

    return true ;
}
440
/**
 * Parses a template variable of the form {{{name}}}.
 *
 * @param int    $a   Cursor; moved behind the closing "}}}" on success
 * @param string $xml Output buffer; receives <templatevar>...</templatevar>
 * @return bool
 */
function p_template_variable ( &$a , &$xml )
{
    $pos = $a ;
    $inner = "" ;
    $ok = $this->nextis ( $pos , "{{{" )
        && $this->p_internal_link_text ( $pos , $inner , false , "}}}" )
        && $this->nextis ( $pos , "}}}" ) ;
    if ( !$ok ) return false ;

    $xml .= "<templatevar>{$inner}</templatevar>" ;
    $a = $pos ;
    return true ;
}
452
# Bold / italics

/**
 * Parses bold markup (''') via p_intwined().
 *
 * @param int    $a       Cursor into the wiki text
 * @param string $xml     Output buffer
 * @param string $recurse Parser (without "p_") used for the text after the markup
 * @param string $end     Closing marker handed to that parser; "" for none
 * @return bool
 */
function p_bold ( &$a , &$xml , $recurse = "restofline" , $end = "" )
{
    $tag = "bold" ;
    $markup = "'''" ;
    return $this->p_intwined ( $a , $xml , $tag , $markup , $recurse , $end ) ;
}
458
/**
 * Parses italics markup ('') via p_intwined().
 *
 * @param int    $a       Cursor into the wiki text
 * @param string $xml     Output buffer
 * @param string $recurse Parser (without "p_") used for the text after the markup
 * @param string $end     Closing marker handed to that parser; "" for none
 * @return bool
 */
function p_italics ( &$a , &$xml , $recurse = "restofline" , $end = "" )
{
    $tag = "italics" ;
    $markup = "''" ;
    return $this->p_intwined ( $a , $xml , $tag , $markup , $recurse , $end ) ;
}
463
/**
 * Shared implementation of p_bold() and p_italics().
 *
 * $this->bold_italics holds one letter per currently open tag ("B" / "I").
 * If the innermost open tag is of this kind the markup closes it, otherwise
 * it opens a new one - unless a tag of this kind is open further out, which
 * is rejected. The text after the markup is parsed by "p_{$recurse}"; the
 * previous bold/italics state is restored afterwards.
 *
 * @param int    $a       Cursor; moved on success only
 * @param string $xml     Output buffer; appended to on success only
 * @param string $tag     XML tag name ("bold" or "italics")
 * @param string $markup  Wiki markup that toggles the tag
 * @param string $recurse Parser (without "p_") for the following text
 * @param string $end     When not "", passed to that parser as third argument
 * @return bool
 */
function p_intwined ( &$a , &$xml , $tag , $markup , $recurse , $end )
{
    $pos = $a ;
    if ( !$this->nextis ( $pos , $markup ) ) return false ;

    $id = substr ( ucfirst ( $tag ) , 0 , 1 ) ;
    $saved = $this->bold_italics ;
    if ( substr ( $saved , -1 ) == $id )
    {
        # Innermost open tag is ours: close it
        $x = "</{$tag}>" ;
        $this->bold_italics = substr ( $saved , 0 , -1 ) ;
    }
    else
    {
        if ( false !== strpos ( $saved , $id ) ) return false ; # Can't close a tag that ain't open
        $x = "<{$tag}>" ;
        $this->bold_italics = $saved . $id ;
    }

    if ( $end == "" ) {
        $res = $this->once ( $pos , $x , $recurse ) ;
    } else {
        $method = "p_{$recurse}" ;
        $res = $this->$method ( $pos , $x , $end ) ;
    }

    $this->bold_italics = $saved ;
    if ( !$res ) return false ;

    $xml .= $x ;
    $a = $pos ;
    return true ;
}
504
/**
 * Copies HTML-escaped text up to (not including) one of the $goodstop strings.
 *
 * Templates met on the way are handled by p_template(). Meeting one of the
 * $badstop strings first makes the scan fail, and so does reaching the end
 * of the text while $goodstop is not empty.
 *
 * @param int      $a        Cursor; moved to the stop position on success
 * @param string   $xml      Output buffer
 * @param string[] $goodstop Strings that end the scan successfully
 * @param string[] $badstop  Strings that abort the scan
 * @return bool
 */
function scanplaintext ( &$a , &$xml , $goodstop , $badstop )
{
    $pos = $a ;
    $buf = "" ;
    while ( $pos < $this->wl )
    {
        if ( $this->w[$pos] == "{" && $this->once ( $pos , $buf , "template" ) ) continue ;

        foreach ( $goodstop AS $stop ) {
            if ( $this->nextis ( $pos , $stop , false ) ) break 2 ;
        }
        foreach ( $badstop AS $stop ) {
            if ( $this->nextis ( $pos , $stop , false ) ) return false ;
        }

        $buf .= htmlspecialchars ( $this->w[$pos] ) ;
        $pos++ ;
    }

    $ran_out = ( $pos >= $this->wl ) ;
    if ( $ran_out && count ( $goodstop ) > 0 ) return false ; # Reached end; not good

    $xml .= $buf ;
    $a = $pos ;
    return true ;
}
525
# External link

/**
 * Parses a bare URL ("protocol://...") at the current position.
 *
 * The URL ends at a line break, a blank or "|", and additionally at "]"
 * when $mark is false. One trailing "." or "," is not counted as part of
 * the URL. The URL is HTML-escaped exactly once (quotes included) before
 * it is written, so "&" comes out as "&amp;" and not as "&amp;amp;".
 *
 * @param int    $a    Cursor; moved behind the URL on success
 * @param string $xml  Output buffer
 * @param bool   $mark true: emit <link type='external' href='...'/>;
 *                     false: emit only the escaped URL
 * @return bool
 */
function p_external_freelink ( &$a , &$xml , $mark = true )
{
    if ( $this->wl <= $a + 10 ) return false ; # Can't have an URL shorter than that
    if ( $this->w[$a+5] != '/' && $this->w[$a+7] != '/' ) return false ; # Shortcut for protocols 3-6 chars length

    $protocol = "" ;
    $b = $a ;
    foreach ( $this->protocols AS $p )
    {
        if ( $this->nextis ( $b , $p . "://" ) )
        {
            $protocol = $p ;
            break ;
        }
    }
    if ( $protocol == "" ) return false ;

    # Collect the URL unescaped; it is escaped as a whole below
    $x = "{$protocol}://" ;
    while ( $b < $this->wl )
    {
        $c = $this->w[$b] ;
        if ( $c == "{" && $this->once ( $b , $x , "template" ) ) continue ;
        if ( $c == "\n" || $c == " " || $c == '|' ) break ;
        if ( !$mark && $c == "]" ) break ;
        $x .= $c ;
        $b++ ;
    }

    # Punctuation right after the URL belongs to the sentence
    $lastchar = substr ( $x , -1 ) ;
    if ( $lastchar == "." || $lastchar == "," )
    {
        $x = substr ( $x , 0 , -1 ) ;
        $b-- ;
    }

    $a = $b ;
    $x = htmlspecialchars ( $x , ENT_QUOTES ) ;
    if ( $mark ) $xml .= "<link type='external' href='$x'/>" ;
    else $xml .= $x ;
    return true ;
}
564
/**
 * Parses an external link of the form [url text].
 *
 * Templates directly behind the "[" are expanded first so that a URL coming
 * out of a template is recognised. That only works when p_template()
 * substitutes the call in the wiki text; when it emits a <template> tag
 * instead (cursor moved on), no URL can follow and the parse fails -
 * retrying from the same spot, as was done before, would never terminate.
 *
 * @param int    $a    Cursor; moved behind the closing "]" on success
 * @param string $xml  Output buffer; receives <link type='external' ...>text</link>
 * @param bool   $mark Not used by this method
 * @return bool
 */
function p_external_link ( &$a , &$xml , $mark = true )
{
    $b = $a ;
    if ( !$this->nextis ( $b , "[" ) ) return false ;

    $url = "" ;
    $c = $b ;
    $x = "" ;
    while ( $c < $this->wl && $this->w[$c] == "{" && $this->once ( $c , $x , "template" ) )
    {
        if ( $c != $b ) return false ; # Template was not expanded in place
    }
    if ( $c >= $this->wl ) return false ;

    $x = "" ;
    if ( !$this->p_external_freelink ( $b , $url , false ) ) return false ;
    $this->skipblanks ( $b ) ;
    if ( !$this->scanplaintext ( $b , $x , array ( "]" ) , array ( "\n" ) ) ) return false ;

    $a = $b + 1 ; # Skip the closing "]"
    $xml .= "<link type='external' href='{$url}'>{$x}</link>" ;
    return true ;
}
582
# Heading

/**
 * Parses a heading line such as "== Title ==".
 *
 * The line has to end with as many "=" as it starts with.
 *
 * @param int    $a   Cursor; moved behind the line on success
 * @param string $xml Output buffer; receives <heading level='n'>...</heading>
 * @return bool
 */
function p_heading ( &$a , &$xml )
{
    if ( $a >= $this->wl || $this->w[$a] != '=' ) return false ;

    # Count the leading "=" signs
    $pos = $a ;
    $level = 0 ;
    while ( $this->nextis ( $pos , "=" ) ) $level++ ;
    $marker = str_repeat ( "=" , $level ) ;

    $this->skipblanks ( $pos ) ;
    $text = "" ;
    if ( !$this->once ( $pos , $text , "restofline" ) ) return false ;

    if ( $this->compensate_markup_errors ) $text = trim ( $text ) ;
    else if ( $text != trim ( $text ) ) $xml .= "<error type='heading' reason='trailing blank'/>" ;

    if ( substr ( $text , -$level ) != $marker ) return false ; # No match

    $text = trim ( substr ( $text , 0 , -$level ) ) ;
    $a = $pos ;
    $xml .= "<heading level='" . $level . "'>{$text}</heading>" ;
    return true ;
}
608
# Line

/**
 * Often used function for parsing the rest of a text line.
 *
 * Handles magic variables, links, templates, tables, HTML and bold/italics
 * inside the line; everything else is copied HTML-escaped. The line break
 * that ends the line is consumed. Fails when bold/italics markup is still
 * open at the end of the line.
 *
 * @param int    $a       Cursor; moved behind the line on success
 * @param string $xml     Output buffer
 * @param array  $closeit List of closing markers (see note in the body)
 * @return bool
 */
function p_restofline ( &$a , &$xml , $closeit = array() )
{
    $pos = $a ;
    $line = "" ;
    $override = false ; # Set once bold/italics took over the remainder of the line

    while ( $pos < $this->wl && !$override )
    {
        $c = $this->w[$pos] ;
        if ( $c == "\n" ) {
            $pos++ ;
            break ;
        }

        # NOTE(review): this "break" leaves only the foreach, so the markers
        # in $closeit have no effect on the scan. Kept as found - confirm
        # whether "break 2" was meant.
        foreach ( $closeit AS $z )
            if ( $this->nextis ( $pos , $z , false ) ) break ;

        if ( $c == "_" ) {
            if ( $this->once ( $pos , $line , "magic_variable" ) ) continue ;
        } else if ( $c == "[" ) {
            if ( $this->once ( $pos , $line , "internal_link" ) ) continue ;
            if ( $this->once ( $pos , $line , "external_link" ) ) continue ;
        } else if ( $c == "{" ) {
            if ( $this->once ( $pos , $line , "template_variable" ) ) continue ;
            if ( $this->once ( $pos , $line , "template" ) ) continue ;
            if ( $this->p_table ( $pos , $line ) ) continue ;
        } else if ( $c == "<" ) {
            if ( $this->once ( $pos , $line , "html" ) ) continue ;
        } else if ( $c == "'" ) {
            if ( $this->once ( $pos , $line , "bold" ) || $this->once ( $pos , $line , "italics" ) ) {
                $override = true ;
                break ;
            }
        }

        # NOTE(review): the guard tests offsets 5 and 7 from the START of the
        # line ($a), not from the current position, and wants both to be '/'.
        # Kept as found - confirm the intent before changing it.
        if ( $pos + 10 < $this->wl &&
                ( $this->w[$a+5] == '/' && $this->w[$a+7] == '/' ) &&
                $this->once ( $pos , $line , "external_freelink" ) ) continue ;

        # Just an ordinary character
        $line .= htmlspecialchars ( $c ) ;
        $pos++ ;
    }

    if ( !$override && $this->bold_italics != "" ) return false ;

    $xml .= $line ;
    $a = $pos ;
    return true ;
}
648
/**
 * Parses one line of ordinary paragraph text.
 *
 * Unless $force is set, lines that start a list, a pre block, a blank line,
 * a table, a heading or a horizontal rule are refused.
 *
 * @param int    $a     Cursor; moved behind the line on success
 * @param string $xml   Output buffer
 * @param bool   $force Accept the line whatever it starts with
 * @return bool
 */
function p_line ( &$a , &$xml , $force )
{
    if ( $a >= $this->wl ) return false ; # Already at the end of the text

    if ( !$force )
    {
        $first = $this->w[$a] ;
        if ( strpos ( "*:#; \n" , $first ) !== false ) return false ; # Not a suitable beginning
        if ( $this->nextis ( $a , "{|" , false ) ) return false ; # Table
        if ( count ( $this->tables ) > 0 ) { # Inside a table
            if ( $this->nextis ( $a , "|" , false ) ) return false ;
            if ( $this->nextis ( $a , "!" , false ) ) return false ;
        }
        if ( $this->nextis ( $a , "=" , false ) ) return false ; # Heading
        if ( $this->nextis ( $a , "----" , false ) ) return false ; # <hr>
    }

    $this->bold_italics = "" ;
    return $this->once ( $a , $xml , "restofline" ) ;
}
665
/**
 * Consumes a single line break at the cursor.
 *
 * @param int    $a   Cursor; moved behind the line break on success
 * @param string $xml Output buffer (not written to)
 * @return bool true if there was a line break
 */
function p_blankline ( &$a , &$xml )
{
    return $this->nextis ( $a , "\n" ) ? true : false ;
}
671
/**
 * Parses a paragraph: one or more text lines plus the blank lines after them.
 *
 * @param int    $a     Cursor; moved behind the paragraph on success
 * @param string $xml   Output buffer; receives <paragraph>...</paragraph>
 * @param bool   $force Accept the first line whatever it starts with
 * @return bool
 */
function p_block_lines ( &$a , &$xml , $force = false )
{
    $content = "" ;
    $pos = $a ;
    if ( !$this->p_line ( $pos , $content , $force ) ) return false ;

    do {
        $more = $this->p_line ( $pos , $content , false ) ;
    } while ( $more ) ;

    # Skip coming blank lines
    do {
        $blank = $this->p_blankline ( $pos , $content ) ;
    } while ( $blank ) ;

    $xml .= "<paragraph>{$content}</paragraph>" ;
    $a = $pos ;
    return true ;
}
683
684
685
# PRE block

/**
 * Parses a line starting with ' '.
 *
 * The cursor is moved only when the line was parsed; on failure the caller
 * gets it back unchanged, leading blank included.
 *
 * @param int    $a   Cursor; moved behind the line on success
 * @param string $xml Output buffer; receives <preline>...</preline>
 * @return bool
 */
function p_preline ( &$a , &$xml )
{
    if ( $a >= $this->wl ) return false ; # Already at the end of the text
    if ( $this->w[$a] != ' ' ) return false ; # Not a preline

    $b = $a + 1 ; # Skip the leading blank
    $this->bold_italics = "" ;
    $x = "" ;
    $ret = $this->once ( $b , $x , "restofline" ) ;
    if ( $ret ) {
        $xml .= "<preline>" . $x . "</preline>" ;
        $a = $b ;
    }
    return $ret ;
}
702
/**
 * Parses a block of lines each starting with ' ', plus the blank lines after it.
 *
 * @param int    $a   Cursor; moved behind the block on success
 * @param string $xml Output buffer; receives <preblock>...</preblock>
 * @return bool false when the first line is not a pre line
 */
function p_block_pre ( &$a , &$xml )
{
    $content = "" ;
    $pos = $a ;
    if ( !$this->onceormore ( $pos , $content , "preline" ) ) return false ;

    # Any number of blank lines, possibly none
    $this->once ( $pos , $content , "blankline" , false , true ) ;

    $xml .= "<preblock>{$content}</preblock>" ;
    $a = $pos ;
    return true ;
}
714
# LIST block

/**
 * Returns a list tag depending on the wiki markup.
 *
 * @param string $c    List marker: '#', '*', ':' or ';'
 * @param bool   $open true for the opening tag, false for the closing tag
 * @return string "</list>", or "<list type='...'>" ("<list>" for an unknown marker)
 */
function listtag ( $c , $open = true )
{
    if ( !$open ) return "</list>" ;

    $types = array (
        '#' => "numbered" ,
        '*' => "bullet" ,
        ':' => "ident" ,
        ';' => "def" ,
    ) ;
    $type = "" ;
    foreach ( $types AS $marker => $name ) {
        if ( $c == $marker ) $type = $name ;
    }

    if ( $type == "" ) return "<list>" ;
    return "<list type='{$type}'>" ;
}
729
/**
 * Opens/closes list tags when the list markup changes from one line to the next.
 *
 * The common leading markers of both lines stay open. What is left of the
 * old markup is closed from the innermost level outwards, then what is left
 * of the new markup is opened.
 *
 * @param string $last List markup of the previous line, e.g. "**"
 * @param string $cur  List markup of the current line, e.g. "*#"
 * @return string XML that performs the transition
 */
function fixlist ( $last , $cur )
{
    # Remove matching parts
    $shared_prefix = false ;
    while ( $last != "" && $cur != "" && $last[0] == $cur[0] )
    {
        $shared_prefix = true ;
        $last = substr ( $last , 1 ) ;
        $cur = substr ( $cur , 1 ) ;
    }

    # After closing deeper levels the shared level continues with a new item
    $fixitemtag = ( $last != "" && $shared_prefix ) ;

    # Close old tags
    $out = "" ;
    while ( $last != "" )
    {
        $out .= "</listitem>" . $this->listtag ( substr ( $last , -1 ) , false ) ;
        $last = substr ( $last , 0 , -1 ) ;
    }
    if ( $fixitemtag ) $out .= "</listitem><listitem>" ;

    # Open new tags
    while ( $cur != "" )
    {
        $out .= $this->listtag ( $cur[0] ) . "<listitem>" ;
        $cur = substr ( $cur , 1 ) ;
    }

    return $out ;
}
765
/**
 * Parses a single list line.
 *
 * The leading run of "*", "#", ":" and ";" is the list markup of the line.
 * The tags needed to get from the previous markup ($last) to this one are
 * written even when the line has no markup at all - that is what closes the
 * lists still open at the end of a list block.
 *
 * A line whose markup is exactly ";" is a definition: the text up to the
 * first ":" becomes <defkey>, the remainder <defval>.
 *
 * @param int    $a    Cursor; moved behind the line
 * @param string $xml  Output buffer
 * @param string $last List markup of the previous line; updated to this line's
 * @return bool false when the line carries no list markup
 */
function p_list_line ( &$a , &$xml , &$last )
{
    # Collect the list markup
    $cur = "" ;
    while ( $a < $this->wl && strpos ( "*#:;" , $this->w[$a] ) !== false )
    {
        $cur .= $this->w[$a] ;
        $a++ ;
    }

    $unchanged = ( $last == $cur ) ;
    $xml .= $this->fixlist ( $last , $cur ) ;

    if ( $cur == "" ) return false ; # Not a list line
    $last = $cur ;
    $this->skipblanks ( $a ) ;

    if ( $unchanged ) $xml .= "</listitem><listitem>" ; # Same level, next item

    if ( $cur != ";" )
    {
        $this->p_restofline ( $a , $xml ) ;
        return true ;
    }

    # Definition: look for the ":" that separates key and value
    $colon = $a ;
    while ( $colon < $this->wl && $this->w[$colon] != "\n" && $this->w[$colon] != ':' ) $colon++ ;
    $has_value = ( $colon < $this->wl && $this->w[$colon] == ':' ) ;

    # Turning the ":" into a line break lets key and value parse as two lines
    if ( $has_value ) $this->w[$colon] = "\n" ;

    $xml .= "<defkey>" ;
    $this->p_restofline ( $a , $xml ) ;
    $xml .= "</defkey>" ;

    if ( $has_value )
    {
        $xml .= "<defval>" ;
        $this->p_restofline ( $a , $xml ) ;
        $xml .= "</defval>" ;
    }
    return true ;
}
812
/**
 * Checks for a list block (those nasty things starting with '*', '#', or the like...)
 * and parses it line by line.
 *
 * @param int    $a   Cursor; moved behind the block
 * @param string $xml Output buffer
 * @return bool true if at least one list line was parsed
 */
function p_block_list ( &$a , &$xml )
{
    $markup = "" ; # List markup of the previous line
    $lines = 0 ;
    while ( $this->p_list_line ( $a , $xml , $markup ) ) $lines++ ;
    return $lines > 0 ;
}
821
# HTML

/**
 * Detects a HTML tag, finds the matching close tag, parses everything in
 * between, and appends the result to $xml.
 *
 * Tags listed in $this->directhtmltags are written as their xhtml:
 * counterpart, all others as <extension extension_name='...'>. The content
 * of <nowiki> and <math> is only escaped; <gallery> content is turned into
 * a wiki table first and written without a surrounding tag; anything else
 * goes through a sub-parser.
 *
 * @param int    $a   Cursor; moved behind the closing tag on success
 * @param string $xml Output buffer
 * @return bool false if there is no tag at $a or it is never closed
 */
function p_html ( &$a , &$xml )
{
    if ( !$this->nextis ( $a , "<" , false ) ) return false ;

    $pos = $a ;
    $attrs = "" ;
    $tag = "" ;
    $closing = false ;
    $selfclosing = false ;
    if ( !$this->p_html_tag ( $pos , $attrs , $tag , $closing , $selfclosing ) ) return false ;

    if ( isset ( $this->directhtmltags[$tag] ) )
    {
        $mapped = $this->directhtmltags[$tag] ;
        $tag_open = "<" . $mapped ;
        $tag_close = "</" . $mapped . ">" ;
    }
    else
    {
        $tag_open = "<extension extension_name='{$tag}'" ;
        $tag_close = "</extension>" ;
    }

    # Is this tag self-closing?
    if ( $selfclosing )
    {
        $a = $pos ;
        $xml .= $tag_open . $attrs . ">" . $tag_close ;
        return true ;
    }

    # Find the matching close tag
    # TODO : The simple open/close counter should be replaced with a
    #        stack to allow for tolerating half-broken HTML,
    #        such as unclosed <li> tags
    $begin = $pos ;
    $depth = 1 ;
    $tag2 = "" ;
    while ( $depth > 0 && $pos < $this->wl )
    {
        $x2 = "" ;
        $last = $pos ; # Where the tag examined in this round starts
        if ( !$this->p_html_tag ( $pos , $x2 , $tag2 , $closing , $selfclosing ) )
        {
            # No tag here: expand a template (not inside <nowiki>) or move on one character
            $dummy = "" ;
            $expanded = ( $tag != "nowiki" && $this->w[$pos] == '{' && $this->p_template ( $pos , $dummy ) ) ;
            if ( !$expanded ) $pos++ ;
            continue ;
        }
        if ( $tag != $tag2 || $selfclosing ) continue ; # Some other tag
        if ( $closing ) $depth-- ;
        else $depth++ ;
    }

    if ( $depth > 0 ) return false ; # Tag was never closed

    # What happens in between?
    $between = substr ( $this->w , $begin , $last - $begin ) ;

    if ( $tag == "nowiki" || $tag == "math" )
    {
        $between = htmlspecialchars ( $between ) ; # No wiki parsing in here
    }
    else
    {
        if ( $tag == 'gallery' ) {
            $this->gallery2wiki ( $between ) ;
            $tag_open = "" ;
            $tag_close = "" ;
        }

        # Parse the part in between the tags
        $subparser = new wiki2xml ;
        $subparser->inherit ( $this ) ;
        $parsed = $subparser->parse ( $between ) ;

        # Was the parsing correct?
        if ( $parsed != $this->errormessage )
            $between = $this->strip_single_paragraph ( $parsed ) ; # No <paragraph> for inline HTML tags
        else
            $between = htmlspecialchars ( $between ) ; # Incorrect markup, use safe wiki source instead
    }

    $a = $pos ;
    if ( $tag_open != "" ) $xml .= $tag_open . $attrs . ">" ;
    $xml .= $between ;
    if ( $tag_close != "" ) $xml .= $tag_close ;
    return true ;
}
914
/**
 * Converts the lines within a <gallery> to a wiki table, four images per row.
 *
 * Every non-blank line is "link" or "link|caption". Without a caption one is
 * generated from the file name: the part up to the first ":" is dropped, and
 * so is the extension when there is one. Blank lines are skipped, so an empty
 * gallery gives an empty table instead of a cell with an empty link.
 *
 * @param string $text Content of the gallery tag; replaced by the wiki table
 */
function gallery2wiki ( &$text ) {
    $lines = explode ( "\n" , trim ( $text ) ) ;
    $text = "{| style='border-collapse: collapse; border: 1px solid grey;'\n" ;
    $cnt = 0 ; # Cells in the current row
    foreach ( $lines AS $line ) {
        if ( trim ( $line ) === '' ) continue ; # Nothing to show

        if ( $cnt >= 4 ) { # Row is full
            $cnt = 0 ;
            $text .= "|--\n" ;
        }

        $parts = explode ( "|" , $line , 2 ) ;
        $link = $parts[0] ;
        if ( count ( $parts ) > 1 ) {
            $caption = $parts[1] ;
        } else { # Generate caption from file name
            $name = explode ( ":" , $link , 2 ) ;
            $name = array_pop ( $name ) ;
            $pieces = explode ( "." , $name ) ;
            if ( count ( $pieces ) > 1 ) array_pop ( $pieces ) ; # Drop the extension, if any
            $caption = implode ( "." , $pieces ) ;
        }

        $text .= "|valign=top align=left|[[{$link}|thumb|center|]]<br/>{$caption}\n" ;
        $cnt++ ;
    }
    $text .= "|}\n" ;
}
943
/**
 * Removes the <paragraph> wrapper when $s consists of exactly one paragraph.
 *
 * @param string $s Parsed XML
 * @return string $s without the wrapper, or $s unchanged
 */
function strip_single_paragraph ( $s )
{
    # Exactly one opening and one closing tag ...
    if ( substr_count ( $s , "paragraph>" ) != 2 ) return $s ;
    # ... sitting at the very beginning and the very end
    if ( substr ( $s , 0 , 11 ) != "<paragraph>" ) return $s ;
    if ( substr ( $s , -12 ) != "</paragraph>" ) return $s ;

    return substr ( $s , 11 , -12 ) ;
}
952
/**
 * Checks for and parses a single HTML tag (opening, closing or self-closing).
 * Only to be called from p_html, as it returns only a partial extension tag!
 *
 * @param int    $a           Cursor; moved behind the ">" on success
 * @param string $xml         Set (not appended) to " attr attr ..." when the tag has attributes
 * @param string $tag         Receives the lower-cased tag name without any "/"
 * @param bool   $closing     Receives true for a closing tag
 * @param bool   $selfclosing Receives true for a self-closing tag; always true for "br"
 * @return bool
 */
function p_html_tag ( &$a , &$xml , &$tag , &$closing , &$selfclosing )
{
    if ( $this->w[$a] != '<' ) return false ;
    $pos = $a + 1 ;
    $this->skipblanks ( $pos ) ;

    # Tag name
    $tag = "" ;
    if ( !$this->scanplaintext ( $pos , $tag , array ( " " , ">" ) , array ( "\n" ) ) ) return false ;

    $this->skipblanks ( $pos ) ;
    if ( $pos >= $this->wl ) return false ;

    $tag = trim ( strtolower ( $tag ) ) ;
    $closing = false ;
    $selfclosing = false ;

    # Is closing tag?
    if ( substr ( $tag , 0 , 1 ) == "/" )
    {
        $tag = substr ( $tag , 1 ) ;
        $closing = true ;
        $this->skipblanks ( $pos ) ;
        if ( $pos >= $this->wl ) return false ;
    }

    # A "/" glued to the name, as in <br/>
    if ( substr ( $tag , -1 ) == "/" )
    {
        $tag = substr ( $tag , 0 , -1 ) ;
        $selfclosing = true ;
    }

    # Parsing attributes: find the end of the tag, ignoring ">" and "/" inside quotes
    $attr_start = $pos ;
    $quote = "" ;
    while ( $pos < $this->wl && ( $quote != "" || ( $this->w[$pos] != '>' && $this->w[$pos] != '/' ) ) )
    {
        $ch = $this->w[$pos] ;
        if ( $ch == '"' || $ch == "'" )
        {
            if ( $quote == "" ) $quote = $ch ;
            else if ( $ch == $quote ) $quote = "" ;
        }
        $pos++ ;
    }
    if ( $pos >= $this->wl ) return false ;
    $attrs = $this->preparse_attributes ( substr ( $this->w , $attr_start , $pos - $attr_start + 1 ) ) ;

    # Is self closing?
    if ( $tag == 'br' ) $selfclosing = true ; # Always regard <br> as <br/>
    if ( $this->w[$pos] == '/' )
    {
        $pos++ ;
        $this->skipblanks ( $pos ) ;
        $selfclosing = true ;
    }

    $this->skipblanks ( $pos ) ;
    if ( $pos >= $this->wl ) return false ;
    if ( $this->w[$pos] != '>' ) return false ;

    $a = $pos + 1 ;
    if ( count ( $attrs ) > 0 ) $xml = " " . implode ( " " , $attrs ) ;
    return true ;
}
1019
/**
 * Replaces templates and separates HTML attributes.
 * It is used for both HTML tags and wiki tables.
 *
 * $x is the attribute text including the ">" or "/" that ends it. The text
 * is run through a temporary parser: templates are expanded, "<" and ">"
 * inside quoted values are turned into entities, and the attributes are
 * then read one by one with p_html_attr(). Of several attributes with the
 * same name the first one wins; attributes ending in ="" are dropped.
 *
 * @param string $x Attribute text, terminator included
 * @return array Attribute name => attribute as returned by p_html_attr();
 *               empty when the text is not a well-formed attribute list
 */
function preparse_attributes ( $x )
{
    # Creating a temporary new parser to run the attribute list in
    $np = new wiki2xml ;
    $np->inherit ( $this ) ;
    $np->w = $x ;
    $np->wl = strlen ( $x ) ;

    # Replacing templates, and '<' and '>' in parameters
    $c = 0 ;
    $q = "" ; # Quote character currently open, if any
    while ( $q != "" || ( $c < $np->wl && $np->w[$c] != '>' && $np->w[$c] != '/' ) )
    {
        # A template expansion may have shortened the text while a quote
        # is still open; never read past the end
        if ( $c >= $np->wl ) return array () ;

        $y = $np->w[$c] ;
        if ( $np->nextis ( $c , "{{" , false ) ) {
            $xx = "" ;
            if ( $np->p_template ( $c , $xx ) ) continue ;
            else $c++ ;
        } else if ( $y == "'" || $y == '"' ) {
            if ( $q == "" ) $q = $y ;
            else if ( $y == $q ) $q = "" ;
            $c++ ;
        } else if ( $q != "" && ( $y == '<' || $y == '>' ) ) {
            # Not advancing: the next round continues at the inserted "&"
            $y = htmlentities ( $y ) ;
            $np->w = substr ( $np->w , 0 , $c ) . $y . substr ( $np->w , $c + 1 ) ;
            $np->wl += strlen ( $y ) - 1 ;
        } else $c++ ;
        if ( $c >= $np->wl ) return array () ;
    }

    $attrs = array () ;
    $c = 0 ;

    # Seeking attributes
    while ( $c < $np->wl && $np->w[$c] != '>' && $np->w[$c] != '/' )
    {
        $attr = "" ;
        if ( !$np->p_html_attr ( $c , $attr ) ) break ;
        if ( $attr != "" ) {
            # (array_shift() takes its argument by reference, hence the variable)
            $parts = explode ( "=" , $attr , 2 ) ;
            $key = array_shift ( $parts ) ;
            if ( !isset ( $attrs[$key] ) && substr ( $attr , -3 , 3 ) != '=""' ) {
                $attrs[$key] = $attr ;
            }
        }
        $np->skipblanks ( $c ) ;
        if ( $c >= $np->wl ) return array () ;
    }

    # Nothing but the terminator may be left over
    if ( substr ( $np->w , $c ) != ">" AND substr ( $np->w , $c ) != "/" ) return array() ;

    return $attrs ;
}
1073
1074
	# This function scans a single HTML tag attribute and returns it as a key="value" string
1076 function p_html_attr ( &$a , &$xml )
1077 {
1078 $b = $a ;
1079 $this->skipblanks ( $b ) ;
1080 if ( $b >= $this->wl ) return false ;
1081 $name = "" ;
1082 if ( !$this->scanplaintext ( $b , $name , array ( " " , "=" , ">" , "/" ) , array ( "\n" ) ) ) return false ;
1083
1084 $this->skipblanks ( $b ) ;
1085 if ( $b >= $this->wl ) return false ;
1086 $name = trim ( strtolower ( $name ) ) ;
1087
1088 # Trying to catch illegal names; should be replaced with regexp
1089 $n2 = "" ;
1090 for ( $q = 0 ; $q < strlen ( $name ) ; $q++ ) {
1091 if ( $name[$q] == '_' OR ( $name[$q] >= 'a' AND $name[$q] <= 'z' ) )
1092 $n2 .= $name[$q] ;
1093 }
1094 $name = trim ( $n2 ) ;
1095 if ( $name == 'extension_name' ) return false ; # Not allowed, because used internally
1096 if ( $name == '' ) return false ;
1097
1098 # Determining value
1099 $value = "" ;
1100 if ( $this->w[$b] == "=" )
1101 {
1102 $b++ ;
1103 $this->skipblanks ( $b ) ;
1104 if ( $b >= $this->wl ) return false ;
1105 $q = "" ;
1106 $is_q = false ;
1107 if ( $this->w[$b] == '"' || $this->w[$b] == "'" )
1108 {
1109 $q = $this->w[$b] ;
1110 $b++ ;
1111 if ( $b >= $this->wl ) return false ;
1112 $is_q = true ;
1113 }
1114 while ( $b < $this->wl )
1115 {
1116 $c = $this->w[$b] ;
1117 if ( $c == $q )
1118 {
1119 $b++ ;
1120 if ( $is_q ) break ;
1121 return false ; # Broken attribute value
1122 }
1123 if ( $this->nextis ( $b , "\\{$q}" ) ) # Ignore escaped quotes
1124 {
1125 $value .= "\\{$q}" ;
1126 continue ;
1127 }
1128 if ( $c == "\n" ) return false ; # Line break before value end
1129 if ( !$is_q && ( $c == ' ' || $c == '>' || $c == '/' ) ) break ;
1130 $value .= htmlspecialchars ( $c ) ;
1131 $b++ ;
1132 }
1133 }
1134 if ( $name == "" ) return true ;
1135
1136 $a = $b ;
1137 if ( $q == "'" ) $q = "'" ;
1138 else $q = '"' ;
1139 $xml = "{$name}={$q}{$value}{$q}" ;
1140 #$xml .= "<attr name='{$name}'>{$value}</attr>" ;
1141 return true ;
1142 }
1143
1144 # Horizontal ruler (<hr> / ----)
1145 function p_hr ( &$a , &$xml )
1146 {
1147 if ( !$this->nextis ( $a , "----" ) ) return false ;
1148 $this->skipblanks ( $a , "-" ) ;
1149 $this->skipblanks ( $a ) ;
1150 $xml .= "<hr/>" ;
1151 return true ;
1152 }
1153
1154 # TABLE
	# Scans the rest of the line as HTML attributes and returns them as a string of key="value" pairs, each preceded by a space
1156 function scanattributes ( &$a )
1157 {
1158 $x = "" ;
1159 while ( $a < $this->wl )
1160 {
1161 if ( $this->w[$a] == "\n" ) break ;
1162 $x .= $this->w[$a] ;
1163 $a++ ;
1164 }
1165 $x .= ">" ;
1166
1167 $attrs = $this->preparse_attributes ( $x ) ;
1168
1169 $ret = "" ;
1170 if ( count ( $attrs ) > 0 )
1171 {
1172 #$ret .= "<attrs>" ;
1173 $ret .= " " . implode ( " " , $attrs ) ;
1174 #$ret .= "</attrs>" ;
1175 }
1176 return $ret ;
1177 }
1178
1179 # Finds the first of the given items; does *not* alter $a
1180 function scanahead ( $a , $matches )
1181 {
1182 while ( $a < $this->wl )
1183 {
1184 foreach ( $matches AS $x )
1185 {
1186 if ( $this->nextis ( $a , $x , false ) )
1187 {
1188 return $a ;
1189 }
1190 }
1191 $a++ ;
1192 }
1193 return -1 ; # Not found
1194 }
1195
1196
1197 # The main table parsing function
1198 function p_table ( &$a , &$xml )
1199 {
1200 if ( $a >= $this->wl ) return false ;
1201 $c = $this->w[$a] ;
1202 if ( $c == "{" && $this->nextis ( $a , "{|" , false ) )
1203 return $this->p_table_open ( $a , $xml ) ;
1204
1205# print "p_table for " . htmlentities ( substr ( $this->w , $a ) ) . "<br/><br/>" ; flush () ;
1206
1207 if ( count ( $this->tables ) == 0 ) return false ; # No tables open, nothing to do
1208
1209 # Compatability for table cell lines starting with blanks; *evil MediaWiki parser!*
1210 $b = $a ;
1211 $this->skipblanks ( $b ) ;
1212 if ( $b >= $this->wl ) return false ;
1213 $c = $this->w[$b] ;
1214
1215 if ( $c != "|" && $c != "!" ) return false ; # No possible table markup
1216
1217 if ( $c == "|" && $this->nextis ( $b , "|}" , false ) ) return $this->p_table_close ( $b , $xml ) ;
1218
1219 #if ( $this->nextis ( $a , "|" , false ) || $this->nextis ( $a , "!" , false ) )
1220 return $this->p_table_element ( $b , $xml , true ) ;
1221 }
1222
1223 function lasttable ()
1224 {
1225 return $this->tables[count($this->tables)-1] ;
1226 }
1227
1228 # Returns the attributes for table cells
1229 function tryfindparams ( &$a )
1230 {
1231 $n = strspn ( $this->w , $this->allowed , $a ) ; # PHP 4.3.0 and above
1232# $n = strspn ( substr ( $this->w , $a ) , $this->allowed ) ; # PHP < 4.3.0
1233 if ( $n == 0 ) return "" ; # None found
1234
1235 $b = $a + $n ;
1236 if ( $b >= $this->wl ) return "" ;
1237 if ( $this->w[$b] != "|" && $this->w[$b] != "!" ) return "" ;
1238 if ( $this->nextis ( $b , "||" , false ) ) return "" ; # Reached a ||, so return blank string
1239 if ( $this->nextis ( $b , "!!" , false ) ) return "" ; # Reached a ||, so return blank string
1240 $this->w[$b] = "\n" ;
1241 $ret = $this->scanattributes ( $a ) ;
1242 $this->w[$b] = "|" ;
1243 $a = $b + 1 ;
1244 return $ret ;
1245 }
1246
1247 function p_table_element ( &$a , &$xml , $newline = false )
1248 {
1249# print "p_table_element for " . htmlentities ( substr ( $this->w , $a ) ) . "<br/><br/>" ; flush () ;
1250 $b = $a ;
1251 $this->skipblanks ( $b ) ; # Compatability for table cells starting with blanks; *evil MediaWiki parser!*
1252 if ( $b >= $this->wl ) return false ; # End of the game
1253 $x = "" ;
1254 if ( $newline && $this->nextis ( $b , "|-" ) ) # Table row
1255 {
1256 $this->skipblanks ( $b , "-" ) ;
1257 $this->skipblanks ( $b ) ;
1258
1259 $attrs = $this->scanattributes ( $b ) ;
1260 if ( $this->tables[count($this->tables)-1]->is_row_open ) $x .= "</tablerow>" ;
1261 else $this->tables[count($this->tables)-1]->is_row_open = true ;
1262 $this->tables[count($this->tables)-1]->had_row = true ;
1263 $x .= "<tablerow{$attrs}>" ;
1264 $y = "" ;
1265 $this->p_restofcell ( $b , $y ) ;
1266 }
1267 else if ( $newline && $this->nextis ( $b , "|+" ) ) # Table caption
1268 {
1269 $this->skipblanks ( $b ) ;
1270 $attrs = $this->tryfindparams ( $b ) ;
1271 $this->skipblanks ( $b ) ;
1272 if ( $this->tables[count($this->tables)-1]->is_row_open ) $x .= "</tablerow>" ;
1273 $this->tables[count($this->tables)-1]->is_row_open = false ;
1274
1275 $y = "" ;
1276 if ( !$this->p_restofcell ( $b , $y ) ) return false ;
1277 $x .= "<tablecaption{$attrs}>{$y}</tablecaption>" ;
1278 }
1279 else # TD or TH
1280 {
1281 $c = $this->w[$b] ;
1282 $b++ ;
1283 $tag = "error" ;
1284 if ( $c == '|' ) $tag = "tablecell" ;
1285 else if ( $c == '!' ) $tag = "tablehead" ;
1286 $attrs = $this->tryfindparams ( $b ) ;
1287 $this->skipblanks ( $b ) ;
1288 if ( !$this->p_restofcell ( $b , $x ) ) return false ;
1289
1290 if ( substr ( $x , 0 , 1 ) == "|" ) # Crude fix to compensate for MediaWiki "tolerant" parsing
1291 $x = substr ( $x , 1 ) ;
1292 $x = "<{$tag}{$attrs}>{$x}</{$tag}>" ;
1293 $this->tables[count($this->tables)-1]->had_cell = true ;
1294 if ( !$this->tables[count($this->tables)-1]->is_row_open )
1295 {
1296 $this->tables[count($this->tables)-1]->is_row_open = true ;
1297 $this->tables[count($this->tables)-1]->had_row = true ;
1298 $x = "<tablerow>{$x}" ;
1299 }
1300 }
1301
1302 $a = $b ;
1303 $xml .= $x ;
1304 return true ;
1305 }
1306
1307 # Finds the substring that composes the table cell,
1308 # then runs a new parser on it
1309 function p_restofcell ( &$a , &$xml )
1310 {
1311 # Get substring for cell
1312 $b = $a ;
1313 $sameline = true ;
1314 $x = "" ;
1315 $itables = 0 ;
1316 while ( $b < $this->wl )
1317 {
1318 $c = $this->w[$b] ;
1319 if ( $c == "<" && $this->once ( $b , $x , "html" ) ) continue ; # Up front to catch pre and nowiki
1320 if ( $c == "\n" ) { $sameline = false ; }
1321 if ( $c == "\n" && $this->nextis ( $b , "\n{|" ) ) { $itables++ ; continue ; }
1322 if ( $c == "\n" && $itables > 0 && $this->nextis ( $b , "\n|}" ) ) { $itables-- ; continue ; }
1323
1324 if ( ( $c == "\n" && $this->nextis ( $b , "\n|" , false ) ) OR
1325 ( $c == "\n" && $this->nextis ( $b , "\n!" , false ) ) OR
1326 ( $c == "\n" && $this->nextis ( $b , "\n |" , false ) ) OR # MediaWiki parser madness compensator
1327 ( $c == "\n" && $this->nextis ( $b , "\n !" , false ) ) OR # MediaWiki parser madness compensator
1328 ( $c == "|" && $sameline && $this->nextis ( $b , "||" , false ) ) OR
1329 ( $c == "!" && $sameline && $this->nextis ( $b , "!!" , false ) ) )
1330 {
1331 if ( $itables == 0 ) break ;
1332 $b += 2 ;
1333 }
1334
1335 if ( $c == "[" && $this->once ( $b , $x , "internal_link" ) ) continue ;
1336 if ( $c == "{" && $this->once ( $b , $x , "template_variable" ) ) continue ;
1337 if ( $c == "{" && $this->once ( $b , $x , "template" ) ) continue ;
1338 $b++ ;
1339 }
1340
1341# if ( $itables > 0 ) return false ;
1342
1343 # Parse cell substring
1344 $s = substr ( $this->w , $a , $b - $a ) ;
1345 $p = new wiki2xml ;
1346 $p->inherit ( $this ) ;
1347 $x = $p->parse ( $s ) ;
1348 if ( $x == $this->errormessage ) return false ;
1349
1350 $a = $b + 1 ;
1351 $xml .= $this->strip_single_paragraph ( $x ) ;
1352 return true ;
1353 }
1354
1355 function p_table_close ( &$a , &$xml )
1356 {
1357 if ( count ( $this->tables ) == 0 ) return false ;
1358 $b = $a ;
1359 if ( !$this->nextis ( $b , "|}" ) ) return false ;
1360 if ( !$this->tables[count($this->tables)-1]->had_row ) return false ; # Table but no row was used
1361 if ( !$this->tables[count($this->tables)-1]->had_cell ) return false ; # Table but no cell was used
1362 $x = "" ;
1363 if ( $this->tables[count($this->tables)-1]->is_row_open ) $x .= "</tablerow>" ;
1364 unset ( $this->tables[count($this->tables)-1] ) ;
1365 $x .= "</table>" ;
1366 $xml .= $x ;
1367 $a = $b ;
1368 while ( $this->nextis ( $a , "\n" ) ) ;
1369 return true ;
1370 }
1371
1372 function p_table_open ( &$a , &$xml )
1373 {
1374 $b = $a ;
1375 if ( !$this->nextis ( $b , "{|" ) ) return false ;
1376
1377 $this->is_row_open = false ;
1378
1379 # Add table to stack
1380 $nt->is_row_open = false ;
1381 $nt->had_row = false ;
1382 $nt->had_cell = false ;
1383 $this->tables[count($this->tables)] = $nt ;
1384
1385 $x = "<table" ;
1386 $x .= $this->scanattributes ( $b ) . ">" ;
1387 while ( $this->nextis ( $b , "\n" ) ) ;
1388
1389 while ( !$this->p_table_close ( $b , $x ) )
1390 {
1391 if ( $b >= $this->wl )
1392 {
1393 unset ( $this->tables[count($this->tables)-1] ) ;
1394 return false ;
1395 }
1396 if ( $this->p_table_open ( $b , $x ) ) continue ;
1397 if ( !$this->p_table_element ( $b , $x , true ) ) # No |} and no table element
1398 {
1399 unset ( $this->tables[count($this->tables)-1] ) ;
1400 return false ;
1401 }
1402 }
1403 $a = $b ;
1404 $xml .= $x ;
1405 return true ;
1406 }
1407
1408 #-----------------------------------
1409 # Parse the article
1410 function p_article ( &$a , &$xml )
1411 {
1412 $x = "" ;
1413 $b = $a ;
1414 while ( $b < $this->wl )
1415 {
1416 if ( $this->onceormore ( $b , $x , "heading" ) ) continue ;
1417 if ( $this->onceormore ( $b , $x , "block_lines" ) ) continue ;
1418 if ( $this->onceormore ( $b , $x , "block_pre" ) ) continue ;
1419 if ( $this->onceormore ( $b , $x , "block_list" ) ) continue ;
1420 if ( $this->onceormore ( $b , $x , "hr" ) ) continue ;
1421 if ( $this->onceormore ( $b , $x , "table" ) ) continue ;
1422 if ( $this->onceormore ( $b , $x , "blankline" ) ) continue ;
1423 if ( $this->p_block_lines ( $b , $x , true ) ) continue ;
1424 # The last resort! It should never come to this!
1425 if ( !$this->compensate_markup_errors ) $xml .= "<error type='general' reason='no matching markup'/>" ;
1426 $xml .= htmlspecialchars ( $this->w[$b] ) ;
1427 $b++ ;
1428 }
1429 $a = $b ;
1430 $xml .= $x ;
1431
1432# asort ( $this->profile ) ;
1433# $p = "" ;
1434# foreach ( $this->profile AS $k => $v ) $p .= "<p>{$k} : {$v}</p>" ;
1435# $xml = "<debug>{$this->cnt}{$p}</debug>" . $xml ;
1436 return true ;
1437 }
1438
1439 # The only function to be called directly from outside the class
1440 function parse ( &$wiki )
1441 {
1442 $this->w = rtrim ( $wiki ) ;
1443
1444 # Fix line endings
1445 $cc = count_chars ( $wiki , 0 ) ;
1446 if ( $cc[10] > 0 && $cc[13] == 0 )
1447 $this->w = str_replace ( "\r" , "\n" , $this->w ) ;
1448 $this->w = str_replace ( "\r" , "" , $this->w ) ;
1449
1450 # Remove HTML comments
1451# $this->w = str_replace ( "\n<!--" , "<!--" , $this->w ) ;
1452 # Important: Do not remove leading \n, since it could be a heading delimiter
1453 $this->w= preg_replace('/\n<!--(.|\s)*?-->\n/', "\n<!-- --> ", $this->w);
1454 $this->w= preg_replace('/<!--(.|\s)*?-->/', '', $this->w);
1455 $this->w= preg_replace('/<!--(.|\s)*$/', '', $this->w);
1456
1457 # Run the thing!
1458# $this->tables = array () ;
1459 $this->wl = strlen ( $this->w ) ;
1460 $xml = "" ;
1461 $a = 0 ;
1462 if ( !$this->p_article ( $a , $xml ) ) return $this->errormessage ;
1463
1464 # XML cleanup
1465 $ol = -1 ;
1466 while ( $ol != strlen ( $xml ) ) {
1467 $ol = strlen ( $xml ) ;
1468 $xml = str_replace ( "<preline> " , "<preline><space/>" , $xml ) ;
1469 $xml = str_replace ( "<space/> " , "<space/><space/>" , $xml ) ;
1470 }
1471 $ol = -1 ;
1472 while ( $ol != strlen ( $xml ) ) {
1473 $ol = strlen ( $xml ) ;
1474 $xml = str_replace ( " " , " " , $xml ) ;
1475 }
1476 $ol = -1 ;
1477 while ( $this->use_space_tag && $ol != strlen ( $xml ) ) {
1478 $ol = strlen ( $xml ) ;
1479 $xml = str_replace ( "> " , "><space/>" , $xml ) ;
1480 $xml = str_replace ( " <" , "<space/><" , $xml ) ;
1481 }
1482 $xml = str_replace ( '<tablerow></tablerow>' , '' , $xml ) ;
1483
1484 return $xml ;
1485 }
1486
1487 }
1488
1489?>
Note: See TracBrowser for help on using the repository browser.