source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wiki2xml/php/xml2xhtml.php@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 16.9 KB
Line 
1<?php
2
# Nesting rules for the generated XHTML.
# $xhtml_allowed maps a parent element name to the list of element names that
# may be opened directly inside it; the key '' stands for "no open element".
global $xhtml_allowed ;
$xhtml_inline = "a,b,br,cite,code,em,font,i,img,small,strong,span,sub,sup,tt,var,";
$xhtml_block = "blockquote,div,dl,h1,h2,h3,h4,h5,h6,hr,ol,p,pre,table,ul,dd,dl2,center,";

# Both lists above end in a comma so they can be concatenated directly.
$xhtml_allowed = array () ;
$xhtml_allowed[''] = $xhtml_block ;
$xhtml_allowed['p'] = $xhtml_inline . "table,pre" ;
$xhtml_allowed['pre'] = $xhtml_inline ;
$xhtml_allowed['center'] = $xhtml_inline ;
$xhtml_allowed['table'] = 'caption,col,colgroup,thead,tfoot,tbody,tr' ;
$xhtml_allowed['tbody'] = 'tr' ;
$xhtml_allowed['tr'] = 'td,th' ;
$xhtml_allowed['td'] = $xhtml_inline . $xhtml_block ;
$xhtml_allowed['th'] = $xhtml_inline . $xhtml_block ;
$xhtml_allowed['caption'] = $xhtml_inline ;
$xhtml_allowed['dl2'] = 'dt' ;
$xhtml_allowed['listitem'] = 'defkey,defval' ;
$xhtml_allowed['ul'] = 'li' ;
$xhtml_allowed['ol'] = 'li' ;
$xhtml_allowed['dl'] = 'dt,dd' ;
$xhtml_allowed['li'] = $xhtml_inline . $xhtml_block ;
$xhtml_allowed['dt'] = $xhtml_inline ;
$xhtml_allowed['dd'] = $xhtml_inline . $xhtml_block ;
$xhtml_allowed['h1'] = $xhtml_inline ;
$xhtml_allowed['h2'] = $xhtml_inline ;
$xhtml_allowed['h3'] = $xhtml_inline ;
$xhtml_allowed['h4'] = $xhtml_inline ;
$xhtml_allowed['h5'] = $xhtml_inline ;
$xhtml_allowed['h6'] = $xhtml_inline ;
$xhtml_allowed['font'] = $xhtml_inline ;
$xhtml_allowed['div'] = $xhtml_inline . $xhtml_block ;
$xhtml_allowed['blockquote'] = $xhtml_block ;

# Captions and list items additionally accept whatever a paragraph accepts.
$xhtml_allowed['caption'] .= $xhtml_allowed['p'] ;
$xhtml_allowed['li'] .= $xhtml_allowed['p'] ;

# Turn every comma-separated string into an array for in_array() lookups.
foreach ( $xhtml_allowed as $k => $v ) {
	$xhtml_allowed[$k] = explode ( ',' , $v ) ;
}
43
44
45# The class
# Converts the wiki2xml XML event stream into XHTML.
#
# Every tag_* method is an element handler invoked by XML2XHTML_START /
# XML2XHTML_END: $open is true for the start of the element and false for its
# end, $attrs holds the element's attributes (null on end events). Attribute
# names arrive upper-cased because the parser is created with PHP's default
# case folding.
#
# Output is accumulated in $this->s. Opened elements are tracked on the
# $this->tags stack; each record is an object with the fields
#   tag                 - element name
#   really_open         - whether "<tag>" was actually written (and so must be closed)
#   close_with_previous - whether the element was opened implicitly by insist_on()
#                         and must be closed together with the element above it
class XML2XHTML {
	var $s = "" ; # XHTML generated so far (temporarily swapped out while a link/extension is collected)
	var $tags = array () ; # Stack of open-element records
	var $ignore_counter = 0 ; # > 0 while inside elements whose character data is dropped
	var $links = array () ; # Stack of link records being assembled

	# State used by tag_extension()
	var $extension_name = "" ;
	var $extension_attrs = array () ;
	var $extension_text_before = "" ;

	# Escapes character data for output. "<" and ">" are always escaped, "&"
	# only when $replace_amp is set. filter_named_entities() is defined outside
	# this file and receives $s first.
	function fix_text ( $s , $replace_amp = false ) {
		filter_named_entities ( $s ) ;
		if ( $replace_amp ) $s = str_replace ( "&" , "&amp;" , $s ) ;
		$s = str_replace ( "<" , "&lt;" , $s ) ;
		$s = str_replace ( ">" , "&gt;" , $s ) ;
		return $s ;
	}

	function add ( $t ) { # Can be altered, e.g. for direct output (echo)
		$this->s .= $t ;
	}

	# Returns true if $tag may be opened inside $base according to the global
	# $xhtml_allowed table. An empty $base means "the element currently on top
	# of the stack".
	function is_allowed ( $tag , $base = "" ) {
		global $xhtml_allowed ;
		if ( $tag == "" ) return false ;
		if ( $base == "" ) {
			$o = $this->top_tag () ;
			$base = $o->tag ;
		}
		if ( !isset ( $xhtml_allowed[$base] ) ) return false ;
		return in_array ( $tag , $xhtml_allowed[$base] ) ;
	}

	# Serialises $attrs as ' name="value"' pairs with lower-cased names.
	# Double quotes inside values are written as &quot; so a value cannot
	# terminate its own attribute.
	function filter_evil_attributes ( $tag , &$attrs ) {
		if ( !is_array ( $attrs ) || count ( $attrs ) == 0 ) return "" ;
		$ret = "" ;
		foreach ( $attrs AS $k => $v ) {
			$ret .= " " . strtolower ( $k ) . '="' . str_replace ( '"' , '&quot;' , $v ) . '"' ;
		}
		return $ret ;
	}

	# Pushes $tag onto the stack and writes its opening tag if it is allowed in
	# the current context. With $bogus set the element is only recorded, never
	# written, so that a later close_tag() for it stays balanced.
	function add_tag ( $tag , $attrs = array () , $bogus = false ) {
		$o = new stdClass ;
		$o->tag = $tag ;
		$o->really_open = $this->is_allowed ( $tag ) ;
		if ( $bogus ) $o->really_open = false ;
		$o->close_with_previous = false ;
		$this->tags[] = $o ;
		if ( $o->really_open ) $this->add ( "<{$tag}" . $this->filter_evil_attributes ( $tag , $attrs ) . ">" ) ;
	}

	# Pops the top element, which must be $tag, and writes its closing tag if
	# it was really opened. Aborts the script (die) when the stack is empty or
	# the top element is a different tag.
	function close_tag ( $tag ) {
		if ( $tag == '' ) return ;
		if ( count ( $this->tags ) == 0 ) die ( "CLOSING NON-OPEN TAG \"{$tag}\" (empty list)" ) ;
		$x = array_pop ( $this->tags ) ;
		if ( $tag != $x->tag ) die ( "CLOSING {$tag} instead of {$x->tag}" ) ;
		if ( $x->really_open ) $this->add ( "</{$x->tag}>" ) ;

		# Auto-close previous?
		$o = $this->top_tag() ;
		if ( $o->close_with_previous ) {
			$this->close_tag ( $o->tag ) ;
		}
	}

	# Makes sure the current context is $tag, or an element that may contain
	# $tag; otherwise opens $tag implicitly and marks it to be closed together
	# with the next element opened inside it.
	function insist_on ( $tag ) {
		global $xhtml_allowed ;
		$top = $this->top_tag () ;
		if ( $top->tag == $tag ) return ; # Everything OK
		if ( isset ( $xhtml_allowed[$top->tag] ) && in_array ( $tag , $xhtml_allowed[$top->tag] ) ) return ; # Everything OK

		# A fresh record is required here: $top is the very object stored on
		# the stack, so changing it would rewrite the existing entry.
		$o = new stdClass ;
		$o->tag = $tag ;
		$o->really_open = true ;
		$o->close_with_previous = true ;
		$this->tags[] = $o ;
		$this->add ( "<{$tag}>" ) ;
	}

	# Returns the record on top of the stack without removing it, or a
	# placeholder record with an empty tag name when the stack is empty.
	function top_tag () {
		if ( count ( $this->tags ) == 0 ) {
			$o = new stdClass ;
			$o->tag = "" ;
			$o->really_open = false ;
			$o->close_with_previous = false ;
			return $o ;
		}
		return end ( $this->tags ) ;
	}

	# Hands the content of an extension element to the MediaWiki parser.
	# Does nothing unless running inside MediaWiki.
	function tag_extension ( $open , &$attrs ) {
		if( !defined( 'MEDIAWIKI' ) ) return ; # Only as MediaWiki extension

		if ( $open ) {
			$this->extension_name = isset ( $attrs['EXTENSION_NAME'] ) ? $attrs['EXTENSION_NAME'] : "" ;
			$this->extension_attrs = $attrs ;
			unset ( $this->extension_attrs['EXTENSION_NAME'] ) ;
			# Collect the element content in an empty buffer
			$this->extension_text_before = $this->s ;
			$this->s = "" ;
		} else {
			$extension_text = trim ( $this->s ) ;
			$this->s = $this->extension_text_before ;
			$this->extension_text_before = "" ;

			global $wgParser , $wgTitle ;
			if ( !isset ( $wgParser ) ) return ; # Paranoia
			if ( !isset ( $wgParser->mTagHooks[$this->extension_name] ) ) return ; # Extension has no handler

			if ( $extension_text == "" ) $extension_text = "<{$this->extension_name}/>" ;
			else $extension_text = "<{$this->extension_name}>{$extension_text}</{$this->extension_name}>" ;
			$options = new ParserOptions ;

			$s = $wgParser->parse ( $extension_text , $wgTitle , $options , false ) ;
			$this->add ( $s->getText() ) ;
		}
	}

	# Paragraph; justified when $xmlg['xhtml_justify'] is set and the element
	# carries no alignment of its own.
	function tag_paragraph ( $open , &$attrs ) {
		global $xmlg ;
		if ( !$open ) {
			$this->close_tag ( "p" ) ;
			return ;
		}
		if ( !is_array ( $attrs ) ) $attrs = array () ;
		# Attribute names are case-folded by the parser, so look for "align"
		# in any case to avoid writing the attribute twice.
		$has_align = false ;
		foreach ( $attrs AS $k => $v ) {
			if ( strtolower ( $k ) == 'align' ) $has_align = true ;
		}
		if ( !$has_align && !empty ( $xmlg['xhtml_justify'] ) ) $attrs['align'] = 'justify' ;
		$this->add_tag ( "p" , $attrs ) ;
	}

	function tag_space ( $open , &$attrs ) {
		if ( $open ) $this->add ( " " ) ;
	}

	# SIMPLE TAGS

	# Opens or closes an attribute-less element.
	function simple_tag ( $open , $tag ) {
		if ( $open ) $this->add_tag ( $tag ) ;
		else $this->close_tag ( $tag ) ;
	}

	function tag_bold ( $open , &$attrs ) { global $xmlg ; $this->simple_tag ( $open , !empty ( $xmlg['xhtml_logical_markup'] ) ? "strong" : "b" ) ; }
	function tag_xhtml_b ( $open , &$attrs ) { $this->simple_tag ( $open , "b" ) ; }
	function tag_xhtml_strong ( $open , &$attrs ) { $this->simple_tag ( $open , "strong" ) ; }
	function tag_italics ( $open , &$attrs ) { global $xmlg ; $this->simple_tag ( $open , !empty ( $xmlg['xhtml_logical_markup'] ) ? "em" : "i" ) ; }
	function tag_xhtml_i ( $open , &$attrs ) { $this->simple_tag ( $open , "i" ) ; }
	function tag_xhtml_em ( $open , &$attrs ) { $this->simple_tag ( $open , "em" ) ; }
	function tag_xhtml_ol ( $open , &$attrs ) { $this->simple_tag ( $open , "ol" ) ; }
	function tag_xhtml_ul ( $open , &$attrs ) { $this->simple_tag ( $open , "ul" ) ; }
	function tag_xhtml_li ( $open , &$attrs ) { $this->simple_tag ( $open , "li" ) ; }
	function tag_xhtml_dt ( $open , &$attrs ) { $this->simple_tag ( $open , "dt" ) ; }
	function tag_xhtml_dl ( $open , &$attrs ) { $this->simple_tag ( $open , "dl" ) ; }
	function tag_xhtml_dd ( $open , &$attrs ) { $this->simple_tag ( $open , "dd" ) ; }
	function tag_xhtml_code ( $open , &$attrs ) { $this->simple_tag ( $open , "code" ) ; }
	function tag_xhtml_pre ( $open , &$attrs ) { $this->simple_tag ( $open , "pre" ) ; }
	function tag_preblock ( $open , &$attrs ) { $this->simple_tag ( $open , "pre" ) ; }
	function tag_preline ( $open , &$attrs ) { if ( !$open ) $this->add ( "\n" ) ; }

	# MISC
	function tag_xhtml_font ( $open , &$attrs ) {
		if ( $open ) $this->add_tag ( "font" , $attrs ) ;
		else $this->close_tag ( "font" ) ;
	}

	function tag_defkey ( $open , &$attrs ) {
		$this->simple_tag ( $open , "defkey" ) ;
	}

	# List container. The TYPE attribute selects ul / ol / dl; definition
	# lists are tracked under the internal name "dl2" and written as <dl>.
	function tag_list ( $open , &$attrs ) {
		if ( !$open ) {
			$o = $this->top_tag () ;
			$this->close_tag ( $o->tag ) ;
			# The closing tag needs the same dl2 -> dl rewrite as the opening one
			if ( $o->tag == 'dl2' ) $this->s = str_replace ( 'dl2>' , 'dl>' , $this->s ) ;
			return ;
		}
		$type = isset ( $attrs['TYPE'] ) ? $attrs['TYPE'] : '' ;
		if ( $type == 'bullet' ) {
			$this->tag_xhtml_ul ( $open , $attrs ) ;
		} else if ( $type == 'numbered' ) {
			$this->tag_xhtml_ol ( $open , $attrs ) ;
		} else if ( $type == 'ident' ) {
			$this->tag_xhtml_dl ( $open , $attrs ) ;
		} else if ( $type == 'def' ) {
			$this->simple_tag ( $open , "dl2" ) ;
			$this->s = str_replace ( 'dl2>' , 'dl>' , $this->s ) ;
		}
	}

	# List item; the element written depends on the enclosing list.
	function tag_listitem ( $open , &$attrs ) {
		$o = $this->top_tag() ;
		if ( !$open ) {
			$this->close_tag ( $o->tag ) ;
			return ;
		}
		if ( $o->tag == 'dt' ) $this->tag_xhtml_dt ( $open , $attrs ) ;
		else if ( $o->tag == 'dl' ) $this->tag_xhtml_dd ( $open , $attrs ) ;
		else if ( $o->tag == 'dl2' ) $this->tag_xhtml_dt ( $open , $attrs ) ;
		else $this->tag_xhtml_li ( $open , $attrs ) ;
	}

	# HTML
	function tag_xhtml_div ( $open , &$attrs ) {
		if ( $open ) $this->add_tag ( "div" , $attrs ) ;
		else $this->close_tag ( "div" ) ;
	}

	function tag_xhtml_span ( $open , &$attrs ) {
		if ( $open ) $this->add_tag ( "span" , $attrs ) ;
		else $this->close_tag ( "span" ) ;
	}

	# LINKS

	# Writes the output for a finished internal link record $o (fields target,
	# text, trail, parts). Images become an <img> block, category (-8) and
	# interlanguage (-9) links produce no output here, everything else is
	# rendered by the global $content_provider.
	function make_internal_link ( &$o ) {
		global $content_provider ;
		$text = $o->text ;
		if ( $text == "" ) $text = $o->target ;
		$text .= $o->trail ;
		$ns = $content_provider->get_namespace_id ( $o->target ) ;

		if ( $ns == 6 ) { # Image
			if ( !$content_provider->do_show_images () ) return ;
			$this->add ( $this->make_image_html ( $o , $text ) ) ;
		} else if ( $ns == -8 ) { # Category link
			if ( !$content_provider->get_var ( 'keep_categories' ) ) return ;
		} else if ( $ns == -9 ) { # Interlanguage link
			if ( !$content_provider->get_var ( 'keep_interlanguage' ) ) return ;
		} else { # Internal link
			$this->add ( $content_provider->get_internal_link ( $o->target , $text ) ) ; # For now
		}
	}

	# Helper for make_internal_link(): builds the XHTML for an image link.
	# $o->parts holds the image options ("thumb", "left"/"right"/"center",
	# "<n>px"); $text is the caption.
	function make_image_html ( &$o , $text ) {
		global $content_provider ;
		$nstext = explode ( ":" , $o->target , 2 ) ;
		$target = array_pop ( $nstext ) ;
		$href = $content_provider->get_image_url ( $target ) ;

		# getimagesize() yields false when the image cannot be read
		$info = @getimagesize ( $href ) ;
		$i_width = is_array ( $info ) ? $info[0] : 0 ;
		$i_height = is_array ( $info ) ? $info[1] : 0 ;
		if ( $i_width <= 0 ) { # Paranoia
			$i_width = 100 ;
			$i_height = 100 ;
		}

		$width = "" ;
		$align = "" ;
		$is_thumb = false ;
		foreach ( $o->parts AS $p ) {
			$p = strtolower ( trim ( $p ) ) ;
			if ( $p == 'thumb' ) {
				$is_thumb = true ;
				if ( $align == '' ) $align = 'right' ;
				if ( $width === '' ) $width = 200 ;
			} else if ( $p == 'right' || $p == 'center' || $p == 'left' ) {
				$align = $p ;
			} else if ( substr ( $p , -2 , 2 ) == 'px' ) {
				# Only a positive number is usable in the size arithmetic below
				$px = intval ( trim ( substr ( $p , 0 , -2 ) ) ) ;
				if ( $px > 0 ) $width = $px ;
			}
		}

		if ( $width === '' ) {
			$size = "" ;
			$css_width = "" ;
		} else {
			# Scale the height to keep the aspect ratio
			$height = (int) round ( ( $i_height * $width ) / $i_width ) ;
			$size = " width='{$width}' height='{$height}'" ;
			$css_width = "width:" . ( $width + 2 ) . "px" ;
		}

		# The caption may contain markup; attribute values get a plain version
		$attr_text = str_replace ( '"' , '&quot;' , strip_tags ( $text ) ) ;

		$s = "" ;
		$image_page = $content_provider->get_full_url ( $o->target ) ;
		if ( $is_thumb ) {
			$s .= '<div class="thumb tright"><div style="' . $css_width . '">' ;
		} else if ( $align != '' ) {
			$style = "float:{$align}" ;
			if ( $css_width != '' ) $style .= ";{$css_width}" ;
			$s .= "<div style='{$style}'>" ;
		}
		$s .= '<a href="' . $image_page . '" title="' . $attr_text . '" class="internal">' ;
		$s .= "<img src='{$href}'{$size} alt=\"{$attr_text}\" longdesc=\"{$image_page}\"/>" ;
		$s .= '</a>' ;
		if ( $is_thumb ) {
			$s .= '<div class="thumbcaption">' ;
			$s .= '<div class="magnify" style="float:right">' ;
			$s .= '<a href="' . $image_page . '" class="internal" title="enlarge">' ;
			$s .= '<img src="http://en.wikipedia.org/skins-1.5/common/images/magnify-clip.png" width="15" height="11" alt="enlarge" />' ;
			$s .= '</a>' ;
			$s .= "</div>" ;
			$s .= $text ;
			$s .= "</div>" ;
		}
		if ( $is_thumb || $align != '' ) $s .= "</div>" ;
		if ( $is_thumb ) $s .= "</div>" ;
		return $s ;
	}

	# Link element. On open a link record is pushed and the output buffer is
	# set aside; target/part/trail children fill the record; on close the
	# buffer is restored and the link is written.
	function tag_link ( $open , &$attrs ) {
		if ( $open ) {
			$o = new stdClass ;
			$o->trail = "" ;
			$o->parts = array () ;
			$o->target = "" ;
			$o->type = 'internal' ;
			$o->href = "" ;
			$o->text = "" ;
			if ( isset ( $attrs['TYPE'] ) ) $o->type = $attrs['TYPE'] ;
			if ( isset ( $attrs['HREF'] ) ) $o->href = $attrs['HREF'] ;
			$o->s = $this->s ;
			$this->s = "" ;
			$this->links[] = $o ;
			return ;
		}

		if ( count ( $this->links ) == 0 ) return ; # Unbalanced close, nothing to finish
		$o = array_pop ( $this->links ) ;
		$text = $this->s ;
		$this->s = $o->s ;
		# The last part is the link text, the ones before it are options
		if ( count ( $o->parts ) > 0 ) $o->text = array_pop ( $o->parts ) ;
		if ( $o->type == 'internal' ) {
			$this->make_internal_link ( $o ) ;
		} else {
			$this->add ( '<a href="' . str_replace ( '"' , '&quot;' , $o->href ) . '">' . $text . '</a>' ) ;
		}
	}

	# The three handlers below move the text collected since the previous
	# child element into the innermost open link record. Without an open link
	# they leave the buffer untouched.
	function tag_target ( $open , &$attrs ) {
		if ( $open ) return ;
		if ( count ( $this->links ) == 0 ) return ;
		$o = array_pop ( $this->links ) ;
		$o->target = $this->s ;
		$this->s = "" ;
		$this->links[] = $o ;
	}

	function tag_part ( $open , &$attrs ) {
		if ( $open ) return ;
		if ( count ( $this->links ) == 0 ) return ;
		$o = array_pop ( $this->links ) ;
		$o->parts[] = $this->s ;
		$this->s = "" ;
		$this->links[] = $o ;
	}

	function tag_trail ( $open , &$attrs ) {
		if ( $open ) return ;
		if ( count ( $this->links ) == 0 ) return ;
		$o = array_pop ( $this->links ) ;
		$o->trail = $this->s ;
		$this->s = "" ;
		$this->links[] = $o ;
	}


	# IGNORE TAGS

	# Character data is dropped by XML2XHTML_DATA while the counter is > 0.
	function ignore ( $open ) {
		if ( $open ) $this->ignore_counter++ ;
		else $this->ignore_counter-- ;
	}

	function tag_template ( $open , &$attrs ) { $this->ignore ( $open ) ; }
	function tag_templatevar ( $open , &$attrs ) { $this->ignore ( $open ) ; }
	function tag_magic_variable ( $open , &$attrs ) { $this->ignore ( $open ) ; }

	# HEADINGS

	# Heading of the given level; without $level the LEVEL attribute is used.
	# The level is clamped to 1..6. Closing pops whatever element is on top.
	function tag_heading ( $open , &$attrs , $level = "" ) {
		if ( !$open ) {
			$o = $this->top_tag() ;
			$this->close_tag ( $o->tag ) ;
			return ;
		}
		# Attributes are only available on the start event
		if ( $level == "" && isset ( $attrs['LEVEL'] ) ) $level = $attrs['LEVEL'] ;
		$level = (int) $level ;
		if ( $level > 6 ) $level = 6 ; # Paranoia
		if ( $level < 1 ) $level = 1 ;
		$this->add_tag ( "h{$level}" ) ;
	}

	function tag_xhtml_h1 ( $open , &$attrs ) { $this->tag_heading ( $open , $attrs , '1' ) ; }
	function tag_xhtml_h2 ( $open , &$attrs ) { $this->tag_heading ( $open , $attrs , '2' ) ; }
	function tag_xhtml_h3 ( $open , &$attrs ) { $this->tag_heading ( $open , $attrs , '3' ) ; }
	function tag_xhtml_h4 ( $open , &$attrs ) { $this->tag_heading ( $open , $attrs , '4' ) ; }
	function tag_xhtml_h5 ( $open , &$attrs ) { $this->tag_heading ( $open , $attrs , '5' ) ; }
	function tag_xhtml_h6 ( $open , &$attrs ) { $this->tag_heading ( $open , $attrs , '6' ) ; }

	# TABLES

	# A table interrupts a written paragraph: the <p> is closed and replaced by
	# a bogus "p" record so the paragraph's own end event still finds its tag.
	function tag_table ( $open , &$attrs ) {
		$o = $this->top_tag() ;
		if ( $o->tag == "p" && $o->really_open ) {
			$this->close_tag ( 'p' ) ;
			$this->add_tag ( "p" , array() , true ) ;
		}
		if ( $open ) {
			$this->add_tag ( "table" , $attrs ) ;
		} else {
			$this->close_tag ( "table" ) ;
		}
	}

	function tag_tablecaption ( $open , &$attrs ) {
		if ( $open ) {
			$this->insist_on ( "table" ) ;
			$this->add_tag ( "caption" , $attrs ) ;
		} else {
			$this->close_tag ( "caption" ) ;
		}
	}

	function tag_tablerow ( $open , &$attrs ) {
		if ( $open ) {
			$this->insist_on ( "table" ) ;
			$this->add_tag ( "tr" , $attrs ) ;
		} else {
			$this->close_tag ( "tr" ) ;
		}
	}

	function tag_tablecell ( $open , &$attrs ) {
		if ( $open ) {
			$this->insist_on ( "tr" ) ;
			$this->add_tag ( "td" , $attrs ) ;
		} else {
			$this->close_tag ( "td" ) ;
		}
	}

	function tag_tablehead ( $open , &$attrs ) {
		if ( $open ) {
			$this->insist_on ( "tr" ) ;
			$this->add_tag ( "th" , $attrs ) ;
		} else {
			$this->close_tag ( "th" ) ;
		}
	}

	function tag_xhtml_table ( $open , &$attrs ) { $this->tag_table ( $open , $attrs ) ; }
	function tag_xhtml_tr ( $open , &$attrs ) { $this->tag_tablerow ( $open , $attrs ) ; }
	function tag_xhtml_td ( $open , &$attrs ) { $this->tag_tablecell ( $open , $attrs ) ; }
	function tag_xhtml_th ( $open , &$attrs ) { $this->tag_tablehead ( $open , $attrs ) ; }
	function tag_xhtml_caption ( $open , &$attrs ) { $this->tag_tablecaption ( $open , $attrs ) ; }

	# Article start: writes the URL-decoded TITLE attribute as an <h1>.
	function tag_article ( $open , &$attrs ) {
		if ( !$open ) return ;
		if ( !isset ( $attrs['TITLE'] ) ) return ;
		$this->add_tag ( "h1" ) ;
		$this->add ( urldecode ( $attrs['TITLE'] ) ) ;
		$this->close_tag ( "h1" ) ;
	}

}
489
490
491# Global functions for parsing
492
# Start-element callback for the XML parser: forwards the event to the
# tag_<name> method of the global converter, if it has one. The element name is
# lower-cased and ':' is replaced by '_' to form the method name; elements
# without a handler are skipped.
function XML2XHTML_START ( $parser , $name , $attrs ) {
	global $xml2xhtml ;
	$handler = 'tag_' . str_replace ( ':' , '_' , strtolower ( $name ) ) ;
	if ( !method_exists ( $xml2xhtml , $handler ) ) return ;
	$xml2xhtml->$handler ( true , $attrs ) ;
}
502
# End-element callback for the XML parser: forwards the event to the
# tag_<name> method of the global converter, if it has one. End events carry no
# attributes, so the handler receives null as its attribute argument.
function XML2XHTML_END ( $parser , $name ) {
	global $xml2xhtml ;
	$handler = 'tag_' . str_replace ( ':' , '_' , strtolower ( $name ) ) ;
	if ( !method_exists ( $xml2xhtml , $handler ) ) return ;
	$attrs = null ; # Handlers take their attributes by reference
	$xml2xhtml->$handler ( false , $attrs ) ;
}
512
# Character-data callback for the XML parser: appends the text, passed through
# the converter's fix_text(), to the converter's buffer. Text is dropped while
# the converter's ignore counter is positive.
function XML2XHTML_DATA ( $parser, $data ) {
	global $xml2xhtml ;
	if ( $xml2xhtml->ignore_counter <= 0 ) {
		$escaped = $xml2xhtml->fix_text ( $data ) ;
		$xml2xhtml->s = $xml2xhtml->s . $escaped ;
	}
}
518
# Converts wiki2xml XML to XHTML.
#
# $xml is either one XML string or an array of XML chunks that is consumed
# through xml_shift(). The input is wrapped between xml_articles_header() and
# a closing </articles> tag before parsing. Nothing is returned: the result is
# left in the global $xml2xhtml object (its buffer is $xml2xhtml->s).
# Parse errors are not reported.
function convert_xml_xhtml ( &$xml ) {
	global $xml2xhtml ;
	$xml2xhtml = new XML2XHTML ;
	$xml_parser_handle = xml_parser_create () ;
	xml_set_element_handler ( $xml_parser_handle , "XML2XHTML_START" , "XML2XHTML_END" ) ;
	xml_set_character_data_handler ( $xml_parser_handle , "XML2XHTML_DATA" ) ;

	xml_parse ( $xml_parser_handle , xml_articles_header () , false ) ;

	if ( is_array ( $xml ) ) {
		while ( $x = xml_shift ( $xml ) ) {
			xml_parse ( $xml_parser_handle , $x , false ) ;
		}
	} else {
		xml_parse ( $xml_parser_handle , $xml , false ) ;
	}

	# The last chunk must be flagged as final for both input shapes,
	# otherwise the parser is never told that the document is complete.
	xml_parse ( $xml_parser_handle , '</articles>' , true ) ;

	xml_parser_free ( $xml_parser_handle ) ;
}
559
560?>
Note: See TracBrowser for help on using the repository browser.