source: gs2-extensions/ngramj/src/wiki/wiki2xml/php/xml2docbook_xml.php@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 14.6 KB
Line 
1<?php
2
/**
 * This file defines the "element" class used by xml2tree.php to build
 * a tree of nodes, which is then converted into DocBook XML.
 */
7
/**
 * One node of the parsed wiki XML tree that can render itself as DocBook XML.
 *
 * A node has a tag name, an attribute map and an ordered list of children;
 * each child is either a raw string or another element. The object passed to
 * the methods as $tree additionally carries the rendering state shared by all
 * nodes: the stack of currently open DocBook tags ($opentags) and the stack
 * of heading levels ($sections).
 */
class element {
	var $name = '' ;          # Tag name, e.g. 'PARAGRAPH' or 'XHTML:UL'
	var $attrs = array () ;   # Attribute name => value
	var $children = array () ;# Child nodes: strings and element objects

	# Temporary variables for link tags (filled by sub_parse() on a LINK node)
	var $link_target = "" ;
	var $link_trail = "" ;
	var $link_parts = array () ;

	# Variables only used by $tree root
	var $list = array () ;
	var $opentags = array () ; # Stack of DocBook tags that are currently open
	var $sections = array () ; # One entry per heading level: 1 = a <section> was opened, 0 = level skipped

	/**
	 * Render all children of this node and concatenate the results.
	 *
	 * Consecutive string children are buffered and escaped in one go through
	 * fix_text(); children named ATTRS are skipped. When this node is a LINK,
	 * the rendered TARGET / PART / TRAIL children are also remembered in the
	 * link_* properties for handle_link().
	 *
	 * @param element $tree root node holding the shared rendering state
	 * @return string rendered markup of all children
	 */
	function sub_parse ( &$tree ) {
		$ret = '' ;
		$temp = '' ;
		foreach ( $this->children as $child ) {
			if ( is_string ( $child ) ) {
				$temp .= $child ;
				continue ;
			}
			if ( $child->name == 'ATTRS' ) continue ;

			# Flush the text collected so far before rendering the element
			$ret .= $this->add_temp_text ( $temp ) ;
			$sub = $child->parse ( $tree , "" , $this ) ;
			if ( $this->name == 'LINK' ) {
				if ( $child->name == 'TARGET' ) $this->link_target = $sub ;
				else if ( $child->name == 'PART' ) $this->link_parts[] = $sub ;
				else if ( $child->name == 'TRAIL' ) $this->link_trail = $sub ;
			}
			$ret .= $sub ;
		}
		return $ret . $this->add_temp_text ( $temp ) ;
	}

	/**
	 * Turn raw text into XML-safe character data.
	 *
	 * HTML entities are decoded first, then "&", "<" and ">" are re-escaped.
	 *
	 * @param string $s raw text
	 * @return string escaped text
	 */
	function fix_text ( $s ) {
		$s = html_entity_decode ( $s , ENT_COMPAT , 'UTF-8' ) ; // dbu 2007-08-20
		# filter_named_entities() is defined outside this file; its return value
		# is ignored, so it presumably works on $s by reference -- TODO confirm
		filter_named_entities ( $s ) ;
		$s = str_replace ( "&" , "&amp;" , $s ) ; # Must run before "<" and ">" so their "&" is not escaped twice
		$s = str_replace ( "<" , "&lt;" , $s ) ;
		$s = str_replace ( ">" , "&gt;" , $s ) ;
		return $s ; // utf8_decode ( $s ) ; // dbu 2007-08-20
	}

	/**
	 * Escape the buffered text and reset the buffer.
	 *
	 * @param string $temp text buffer; set to "" on return
	 * @return string the escaped former buffer content
	 */
	function add_temp_text ( &$temp ) {
		$s = $temp ;
		$temp = "" ;
		return $this->fix_text ( $s ) ;
	}

	/**
	 * Unconditionally open a new $tag and push it on the open-tag stack.
	 *
	 * @param string  $tag  DocBook tag name
	 * @param element $tree root node holding the open-tag stack
	 * @return string the opening tag followed by a newline
	 */
	function add_new ( $tag , &$tree ) {
		return $this->ensure_new ( $tag , $tree , "<{$tag}>\n" ) ;
	}

	/**
	 * Make sure $tag is open.
	 *
	 * Without $opttag nothing happens if $tag is already somewhere on the
	 * open-tag stack. With $opttag the tag is always pushed and $opttag is
	 * returned as the markup (used for tags that may nest, e.g. <section>).
	 *
	 * @param string  $tag    DocBook tag name
	 * @param element $tree   root node holding the open-tag stack
	 * @param string  $opttag explicit opening markup; "" means "<tag>\n"
	 * @return string markup to emit, "" if the tag was already open
	 */
	function ensure_new ( $tag , &$tree , $opttag = "" ) {
		if ( $opttag == "" && in_array ( $tag , $tree->opentags , true ) ) {
			return "" ; # Already open
		}
		array_push ( $tree->opentags , $tag ) ;
		if ( $opttag == "" ) return "<{$tag}>\n" ;
		return $opttag ;
	}

	/**
	 * Close the innermost open $tag together with everything opened after it.
	 *
	 * @param string  $tag  DocBook tag name
	 * @param element $tree root node holding the open-tag stack
	 * @param bool    $all  if true, repeat until no $tag is left open
	 * @return string closing markup, "" if $tag was not open
	 */
	function close_last ( $tag , &$tree , $all = false ) {
		if ( !in_array ( $tag , $tree->opentags , true ) ) return "" ; # Already closed

		$ret = "\n" ;
		while ( count ( $tree->opentags ) > 0 ) {
			$o = array_pop ( $tree->opentags ) ;
			$ret .= "</{$o}>\n" ;
			if ( $o === $tag ) {
				if ( $all ) return $ret . $this->close_last ( $tag , $tree , true ) ;
				return $ret ;
			}
		}
		return $ret ; # Not reachable while $tag is on the stack; avoids an implicit null
	}

	/**
	 * Render an EXTENSION node.
	 *
	 * The content is rendered with a fresh, empty open-tag stack; whatever it
	 * leaves open is closed again and the previous stack is restored. The
	 * "ref" extension is wrapped in a <footnote>, any other extension is
	 * passed through as its rendered content.
	 *
	 * @param element $tree root node holding the open-tag stack
	 * @return string rendered markup
	 */
	function handle_extensions ( &$tree ) {
		$name = isset ( $this->attrs['EXTENSION_NAME'] ) ? strtolower ( $this->attrs['EXTENSION_NAME'] ) : '' ;
		$is_ref = ( $name == 'ref' ) ;

		$ot = $tree->opentags ;
		$tree->opentags = array () ;

		$sub = "" ;
		if ( $is_ref ) $sub .= $this->ensure_new ( 'para' , $tree ) ;
		$sub .= $this->sub_parse ( $tree ) ;
		while ( count ( $tree->opentags ) > 0 ) {
			$sub .= "</" . array_pop ( $tree->opentags ) . ">\n" ;
		}
		$tree->opentags = $ot ;

		if ( $is_ref ) return '<footnote>' . $sub . '</footnote>' ;
		return $sub ;
	}

	/**
	 * Build an id from a title: every byte that is not an ASCII letter or
	 * digit is replaced by "_" (a multi-byte character yields one "_" per byte).
	 *
	 * @param string $title article title
	 * @return string id consisting of [A-Za-z0-9_] only
	 */
	function internal_id ( $title ) {
		# No "u" modifier on purpose: the replacement works byte by byte
		return preg_replace ( '/[^A-Za-z0-9]/' , '_' , (string) $title ) ;
	}

	/**
	 * Render a LINK node.
	 *
	 * External links become <ulink>. For internal links the namespace id
	 * reported by the global $content_provider selects the output: 6 is
	 * rendered as an image (<mediaobject>), -9 as an interlanguage <ulink>,
	 * -8 as plain category text, and anything else as a <link> if the target
	 * is a known article, or as the bare link text otherwise.
	 *
	 * @param element $tree root node holding the open-tag stack
	 * @return string rendered markup
	 */
	function handle_link ( &$tree ) {
		global $content_provider ;

		# Rendering the children also fills link_target / link_parts / link_trail;
		# the open-tag stack is restored to what it was before
		$ot = $tree->opentags ;
		$sub = $this->sub_parse ( $tree ) ;
		$tree->opentags = $ot ;

		if ( isset ( $this->attrs['TYPE'] ) && strtolower ( $this->attrs['TYPE'] ) == 'external' ) { # External link
			$href = isset ( $this->attrs['HREF'] ) ? htmlentities ( $this->attrs['HREF'] ) : '' ;
			if ( trim ( $sub ) == "" ) {
				# No link text: use the part of the URL between "://" and the next "/"
				$sub = explode ( '://' , $href , 2 ) ;
				$sub = explode ( '/' , array_pop ( $sub ) , 2 ) ;
				$sub = array_shift ( $sub ) ;
			}
			$sub = $this->fix_text ( $sub ) ;
			return "<ulink url=\"{$href}\"><citetitle>{$sub}</citetitle></ulink>" ;
		}

		# Internal link: the last part is the link text, falling back to the target
		$link = "" ;
		if ( count ( $this->link_parts ) > 0 ) $link = end ( $this->link_parts ) ;
		$link_text = $link ;
		if ( $link == "" ) $link = $this->link_target ;
		$link .= $this->link_trail ;

		$ns = $content_provider->get_namespace_id ( $this->link_target ) ;

		if ( $ns == 6 ) { # Image
			$target = explode ( ":" , $this->link_target , 2 ) ;
			$target = array_pop ( $target ) ; # Target without the namespace prefix

			# The last part is the caption, the parts before it are options
			$text = (string) array_pop ( $this->link_parts ) ;
			$is_thumb = false ;
			$align = '' ;
			$width = '' ;
			foreach ( $this->link_parts as $s ) {
				if ( trim ( $s ) == 'thumb' ) {
					$is_thumb = true ;
					if ( $align == '' ) $align = 'right' ;
					if ( $width == '' ) $width = '200px' ;
				}
			}

			$href = $content_provider->get_image_url ( $target ) ;

			$link = "<mediaobject>\n<imageobject>\n<imagedata" ;
			$link .= " fileref=\"{$href}\"" ;
			# Only 'center' is emitted until DocBook supports floating images
			if ( $align == 'center' ) $link .= " align='{$align}'" ;
			if ( $width != '' ) $link .= " width='$width' scalefit='1'" ; # depth='$width'
			$link .= "/>\n</imageobject>\n" ;
			$link .= "<textobject>\n" ;
			$link .= "<phrase>{$text}</phrase>\n" ;
			$link .= "</textobject>\n" ;
			if ( $is_thumb ) {
				$link .= "<caption>\n" ;
				if ( substr ( $text , 0 , 5 ) == '<para' ) $link .= $text ; # Caption already is a paragraph
				else $link .= "<para>{$text}</para>\n" ;
				$link .= "</caption>\n" ;
			}
			$link .= "</mediaobject>\n" ;
		} else if ( $ns == -9 ) { # Interlanguage link
			$sub = $this->link_target ;
			$nstext = explode ( ":" , $sub , 2 ) ;
			$name = array_pop ( $nstext ) ;
			$nstext = array_shift ( $nstext ) ; # Language prefix; null if the target has no ":"

			$href = "http://{$nstext}.wikipedia.org/wiki/" . htmlentities ( $name ) ;
			$link = "<ulink url=\"{$href}\"><citetitle>{$sub}</citetitle></ulink>" ;
		} else if ( $ns == -8 ) { # Category link
			if ( $link_text == "!" || $link_text == '*' ) $link = "" ;
			else $link = " ({$link})" ;
			$link = $this->link_target . $link ;
		} else if ( $content_provider->is_an_article ( $this->link_target ) ) {
			# internal_id() already maps "+" (and every other non-alphanumeric byte) to "_"
			$lt = $this->internal_id ( trim ( $this->link_target ) ) ;
			$link = "<link linkend='{$lt}'>{$link}</link>" ;
		}
		# else: target is not part of the output, keep the plain link text

		return $link ;
	}

	/**
	 * Build the <title> and opening <tgroup> of a table.
	 *
	 * The column count is the largest number of cells (TABLECELL / TABLEHEAD,
	 * honouring COLSPAN) found in any TABLEROW child. TABLECAPTION children
	 * are rendered into the title.
	 *
	 * @param element $tree root node holding the open-tag stack
	 * @return string "<title>...</title><tgroup cols='N'>"
	 */
	function make_tgroup ( &$tree ) {
		$max_num_cols = 0 ;
		$caption = "" ;
		foreach ( $this->children as $row ) {
			if ( is_string ( $row ) ) continue ;
			if ( $row->name == 'TABLECAPTION' ) {
				$caption .= $row->parse ( $tree , "DOCAPTION" , $this ) ;
				continue ;
			}
			if ( $row->name != 'TABLEROW' ) continue ;

			$num_cols = 0 ;
			foreach ( $row->children as $col ) {
				if ( is_string ( $col ) ) continue ;
				if ( $col->name != 'TABLECELL' && $col->name != 'TABLEHEAD' ) continue ;
				# A missing, non-numeric or non-positive COLSPAN counts as one column
				$span = isset ( $col->attrs['COLSPAN'] ) ? (int) $col->attrs['COLSPAN'] : 1 ;
				$num_cols += ( $span > 0 ) ? $span : 1 ;
			}
			if ( $num_cols > $max_num_cols ) $max_num_cols = $num_cols ;
		}
		return "<title>{$caption}</title><tgroup cols='{$max_num_cols}'>" ;
	}

	/**
	 * Return the innermost open tag without removing it.
	 *
	 * @param element $tree root node holding the open-tag stack
	 * @return string tag name, "" if no tag is open
	 */
	function top_tag ( &$tree ) {
		if ( count ( $tree->opentags ) == 0 ) return "" ;
		return end ( $tree->opentags ) ;
	}

	/**
	 * Map XHTML list tags onto their wiki equivalents.
	 *
	 * XHTML:UL / XHTML:OL become LIST (with TYPE set on this node) provided a
	 * <para> is open; all tags up to and including that <para> are closed and
	 * the closing markup is appended to $ret. XHTML:LI becomes LISTITEM.
	 *
	 * @param string  $oldtag tag name; rewritten in place on conversion
	 * @param element $tree   root node holding the open-tag stack
	 * @param string  $ret    output buffer; closing markup is appended
	 * @return bool true only for a converted UL/OL (the caller then re-opens a
	 *              paragraph after the list); false otherwise, including LI
	 */
	function convert_xhtml_tags ( &$oldtag , &$tree , &$ret ) {
		if ( substr ( $oldtag , 0 , 6 ) != 'XHTML:' ) return false ;

		$tag = substr ( $oldtag , 6 ) ;
		if ( $tag == 'UL' || $tag == 'OL' ) {
			# Work on a copy so that nothing changes if no <para> is open
			$ot = $tree->opentags ;
			$r = "" ;
			$found = false ;
			while ( count ( $ot ) > 0 ) {
				$x = array_pop ( $ot ) ;
				$r .= "</{$x}>\n" ;
				if ( $x == 'para' ) {
					$found = true ;
					break ;
				}
			}
			if ( !$found ) return false ;

			$tree->opentags = $ot ;
			$this->attrs['TYPE'] = ( $tag == 'UL' ) ? "bullet" : "numbered" ;
			$oldtag = 'LIST' ;
			$ret .= $r ;
			return true ;
		}

		if ( $tag == 'LI' ) $oldtag = 'LISTITEM' ;
		return false ; # No match (LI is renamed but still reported as "not converted")
	}

	/**
	 * Render this node and its subtree as DocBook XML.
	 *
	 * Works in three steps: emit the opening markup for the tag, render the
	 * children (skipped for MAGIC_VARIABLE and TEMPLATE), emit the closing
	 * markup. LINK, EXTENSION, AUTHOR and SPACE are handled by shortcuts.
	 *
	 * @param element $tree  root node holding the open-tag and section stacks
	 * @param string  $param "DOCAPTION" lets a TABLECAPTION render; it returns "" otherwise
	 * @param mixed   $root  parent node; not used by this method
	 * @return string rendered markup
	 */
	function parse ( &$tree , $param = "" , $root = "" ) {
		$ret = '' ;
		$tag = $this->name ;
		$close_tag = "" ;
		$list_type = "" ;
		$retl_before = 0 ;
		$retl_after = 0 ;

		# Inline markup: tag => ( opening markup , DocBook tag to close )
		$inline = array (
			'BOLD' => array ( '<emphasis role="bold">' , 'emphasis' ) , # <b> or '''
			'XHTML:STRONG' => array ( '<emphasis role="bold">' , 'emphasis' ) ,
			'XHTML:B' => array ( '<emphasis role="bold">' , 'emphasis' ) ,
			'ITALICS' => array ( '<emphasis>' , 'emphasis' ) , # <i> or ''
			'XHTML:EM' => array ( '<emphasis>' , 'emphasis' ) ,
			'XHTML:I' => array ( '<emphasis>' , 'emphasis' ) ,
			'XHTML:TT' => array ( '<literal>' , 'literal' ) ,
			'XHTML:SUB' => array ( '<subscript>' , 'subscript' ) ,
			'XHTML:SUP' => array ( '<superscript>' , 'superscript' ) ,
			'PRELINE' => array ( '<programlisting>' , 'programlisting' ) ,
			'XHTML:PRE' => array ( '<programlisting>' , 'programlisting' ) ,
		) ;

		# Pre-fixing XHTML to wiki tags (may rewrite $tag and append to $ret)
		$xhtml_conversion = $this->convert_xhtml_tags ( $tag , $tree , $ret ) ;

		# Open tags
		if ( $tag == 'SPACE' ) {
			return ' ' ; # Speedup
		} else if ( $tag == 'ARTICLES' || $tag == 'AUTHORS' ) {
			# Pure containers; listed to keep the default action from running
		} else if ( $tag == 'AUTHOR' ) {
			add_author ( $this->sub_parse ( $tree ) ) ;
			return "" ;
		} else if ( $tag == 'ARTICLE' ) {
			$title = isset ( $this->attrs["TITLE"] ) ? $this->attrs["TITLE"] : "Untiteled" ;
			$id = $this->internal_id ( $title ) ;
			$ret .= "<article id='{$id}'>\n" ;
			# NOTE(review): the title is emitted without XML escaping -- confirm it cannot contain "&" or "<"
			$ret .= "<title>" . urldecode ( $title ) . "</title>\n" ;
		} else if ( $tag == 'LINK' ) {
			return $this->handle_link ( $tree ) ; # Shortcut
		} else if ( $tag == 'EXTENSION' ) {
			return $this->handle_extensions ( $tree ) ; # Shortcut
		} else if ( $tag == 'HEADING' ) {
			$level = count ( $tree->sections ) ;
			# A missing or non-positive LEVEL is treated as 1; comparing against
			# a missing value would never let the first loop below terminate
			$wanted = isset ( $this->attrs["LEVEL"] ) ? (int) $this->attrs["LEVEL"] : 1 ;
			if ( $wanted < 1 ) $wanted = 1 ;
			$ret .= $this->close_last ( "para" , $tree ) ;
			# Leave every section at the wanted level or deeper
			while ( $level >= $wanted ) {
				if ( array_pop ( $tree->sections ) == 1 ) {
					$ret .= $this->close_last ( "section" , $tree ) ;
				}
				$level-- ;
			}
			# Descend to the wanted level; skipped levels get a placeholder
			while ( $level < $wanted ) {
				$level++ ;
				if ( $level < $wanted ) {
					array_push ( $tree->sections , 0 ) ;
				} else {
					$ret .= $this->ensure_new ( "section" , $tree , "<section>" ) ;
					array_push ( $tree->sections , 1 ) ;
				}
			}
			$ret .= "<title>" ;
		} else if ( $tag == 'PARAGRAPH' || $tag == 'XHTML:P' ) { # Paragraph
			$ret .= $this->close_last ( "para" , $tree ) ;
			$ret .= $this->ensure_new ( "para" , $tree ) ;
		} else if ( $tag == 'LIST' ) { # List
			$ret .= $this->close_last ( "para" , $tree ) ;
			$list_type = isset ( $this->attrs['TYPE'] ) ? strtolower ( $this->attrs['TYPE'] ) : '' ;
			if ( $list_type == 'bullet' || $list_type == 'ident' || $list_type == 'def' ) $ret .= '<itemizedlist mark="opencircle">' ;
			else if ( $list_type == 'numbered' ) $ret .= '<orderedlist numeration="arabic">' ;
		} else if ( $tag == 'LISTITEM' ) { # List item
			$ret .= $this->close_last ( "para" , $tree ) ;
			$ret .= "<listitem>\n" ;
			$ret .= $this->ensure_new ( "para" , $tree ) ;
		} else if ( $tag == 'TABLE' ) { # Table
			$ret .= $this->add_new ( "table" , $tree ) ;
			$ret .= $this->make_tgroup ( $tree ) ;
			$ret .= "<tbody>" ;
		} else if ( $tag == 'TABLEROW' ) { # Tablerow
			# Remember the lengths so that a row without content can be dropped again
			$retl_before = strlen ( $ret ) ;
			$ret .= $this->add_new ( "row" , $tree ) ;
			$retl_after = strlen ( trim ( $ret ) ) ;
		} else if ( $tag == 'TABLEHEAD' || $tag == 'TABLECELL' ) { # Tablehead / Tablecell
			$ret .= $this->add_new ( "entry" , $tree ) ;
		} else if ( $tag == 'TABLECAPTION' ) { # Tablecaption
			# Captions are rendered by make_tgroup() only, not in document order
			if ( $param != "DOCAPTION" ) return "" ;
		} else if ( isset ( $inline[$tag] ) ) { # Inline markup
			$ret .= $this->ensure_new ( "para" , $tree ) ;
			$ret .= $inline[$tag][0] ;
			$close_tag = $inline[$tag][1] ;
		} else if ( $tag == 'DEFVAL' ) {
			$ret .= $this->ensure_new ( "para" , $tree ) ;
			$ret .= " : " ;
		} else { # Default : normal text
			$ret .= $this->ensure_new ( "para" , $tree ) ;
		}

		# Get the sub-items
		if ( $tag != 'MAGIC_VARIABLE' && $tag != 'TEMPLATE' ) {
			$ret .= $this->sub_parse ( $tree ) ;
		}

		# Close tags
		if ( $tag == 'LIST' ) {
			$ret .= $this->close_last ( "para" , $tree ) ;
			if ( $list_type == 'bullet' || $list_type == 'ident' || $list_type == 'def' ) $ret .= "</itemizedlist>\n" ;
			else if ( $list_type == 'numbered' ) $ret .= "</orderedlist>\n" ;
			# An XHTML list was opened inside a paragraph: continue that paragraph
			if ( $xhtml_conversion ) $ret .= $this->ensure_new ( "para" , $tree ) ;
		} else if ( $tag == 'LISTITEM' ) {
			$ret .= $this->close_last ( "para" , $tree ) ;
			$ret .= "</listitem>\n" ;
		} else if ( $close_tag != "" ) {
			$ret .= "</{$close_tag}>" ;
		} else if ( $tag == 'HEADING' ) {
			$ret .= "</title>\n" ;
		} else if ( $tag == 'TABLE' ) { # Table
			$ret .= "</tbody>" ;
			$ret .= "</tgroup>" ;
			$ret .= $this->close_last ( "table" , $tree ) ;
		} else if ( $tag == 'TABLEROW' ) { # Tablerow
			if ( strlen ( trim ( $ret ) ) == $retl_after ) {
				# Nothing was added to the row: drop its opening tag and
				# discard the closing markup, only popping the stack
				$ret = substr ( $ret , 0 , $retl_before ) ;
				$this->close_last ( "row" , $tree ) ;
			} else {
				$ret .= $this->close_last ( "row" , $tree ) ;
			}
		} else if ( $tag == 'TABLEHEAD' || $tag == 'TABLECELL' ) { # Tablehead / Tablecell
			$ret .= $this->close_last ( "entry" , $tree ) ;
		} else if ( $tag == 'ARTICLE' ) {
			$ret .= $this->close_last ( "section" , $tree , true ) ;
			$ret .= $this->close_last ( "para" , $tree ) ;
			$ret .= "</article>" ;
		}

		return $ret ;
	}
}
453
454require_once ( "xml2tree.php" ) ; # Uses the "element" class defined above
455
456?>
Note: See TracBrowser for help on using the repository browser.