source: gs2-extensions/ngramj/src/wiki/wiki2xml/WIKI2XML.cpp@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 16.2 KB
Line 
1#include "WIKI2XML.h"
2
3TTableInfo::TTableInfo ()
4 {
5 tr_open = false ;
6 td_open = false ;
7 }
8
9string TTableInfo::close ()
10 {
11 string ret ;
12 if ( td_open ) ret += "</wikitablecell>" ;
13 if ( tr_open ) ret += "</wikitablerow>" ;
14 ret += "</wikitable>" ;
15 return ret ;
16 }
17
18string TTableInfo::new_row ()
19 {
20 string ret ;
21 if ( td_open ) ret += "</wikitablecell>" ;
22 if ( tr_open ) ret += "</wikitablerow>" ;
23 ret += "<wikitablerow>" ;
24 td_open = false ;
25 tr_open = true ;
26 return ret ;
27 }
28
29string TTableInfo::new_cell ( string type )
30 {
31 string ret ;
32 if ( !tr_open ) ret += new_row () ;
33 if ( td_open ) ret += "</wikitablecell>" ;
34 ret += "<wikitablecell type=\"" + upper ( type ) + "\">" ;
35 td_type = type ;
36 td_open = true ;
37 return ret ;
38 }
39
40// *****************************************************************************
41// *****************************************************************************
42//
43// WIKI2XML
44//
45// *****************************************************************************
46// *****************************************************************************
47
48void WIKI2XML::parse_symmetric ( string &l , int &from ,
49 string s1 , string s2 ,
50 string r1 , string r2 ,
51 bool extend )
52 {
53 int a , b ;
54 if ( !submatch ( l , s1 , from ) ) return ; // Left does not match
55 for ( a = from + s1.length() ; a + s2.length() <= l.length() ; a++ )
56 {
57 if ( !submatch ( l , s2 , a ) ) continue ;
58 for ( b = a+1 ; extend && submatch ( l , s2 , b ) ; b++ ) ;
59 b-- ;
60 l = l.substr ( 0 , from ) +
61 r1 +
62 l.substr ( from + s1.length() , b - from - s1.length() ) +
63 r2 +
64 l.substr ( b + s2.length() , l.length() ) ;
65 if ( debug ) cout << "newl : " << l << endl ;
66 break ;
67 }
68 }
69
70void WIKI2XML::parse_link ( string &l , int &from , char mode )
71 {
72 from += 1 ;
73 int a , cnt = 1 ;
74 chart par_open = '[' ; // mode 'L'
75 chart par_close = ']' ; // mode 'L'
76 if ( mode == 'T' ) { par_open = '{' ; par_close = '}' ; }
77 for ( a = from ; cnt > 0 && a+1 < l.length() ; a++ )
78 {
79 if ( l[a] == par_open && l[a+1] == par_open )
80 parse_link ( l , a ) ;
81 else if ( l[a] == par_close && l[a+1] == par_close )
82 cnt-- ;
83 }
84 if ( cnt > 0 ) return ; // Not a valid link
85
86 int to = a-1 ; // Without "]]"
87 string link = l.substr ( from+1 , to-from-1 ) ;
88
89 TXML x ;
90 vector <string> parts ;
91 explode ( '|' , link , parts ) ;
92 if ( mode == 'L' )
93 {
94 x.name = "wikilink" ;
95 x.add_key_value ( "type" , "internal" ) ;
96 }
97 else if ( mode == 'T' ) x.name = "wikitemplate" ;
98
99 for ( a = 0 ; a < parts.size() ; a++ )
100 {
101 bool last = ( a + 1 == parts.size() ) ;
102 string p = parts[a] ;
103 parse_line_sub ( p ) ;
104
105 if ( a > 0 && ( mode != 'L' || !last ) )
106 {
107 string key , value ;
108 vector <string> subparts ;
109 explode ( '=' , p , subparts ) ;
110 if ( subparts.size() == 1 )
111 {
112 value = xml_embed ( p , "value" ) ;
113 }
114 else
115 {
116 key = xml_embed ( subparts[0] , "key" ) ;
117 subparts.erase ( subparts.begin() ) ;
118 value = xml_embed ( implode ( "=" , subparts ) , "value" ) ;
119 }
120 p = key + value ;
121 }
122 else p = xml_embed ( p , "value" ) ;
123
124 string param = "number=\"" + val ( a ) + "\"" ;
125 if ( last ) param += " last=\"1\"" ;
126 x.text += xml_embed ( p , "wikiparameter" , param ) ;
127 }
128
129 if ( mode == 'L' ) // Try link trail
130 {
131 string trail ;
132 for ( a = to+2 ; a < l.length() && is_text_char ( l[a] ) ; a++ )
133 trail += l[a] ;
134 to = a-2 ;
135 if ( trail != "" ) x.text += xml_embed ( trail , "trail" ) ;
136 }
137
138 x.add_key_value ( "parameters" , val ( parts.size() ) ) ;
139 string replacement = x.get_string () ;
140 parse_line_sub ( replacement ) ;
141
142 l.erase ( from-1 , to-from+3 ) ;
143 l.insert ( from-1 , replacement ) ;
144 if ( debug ) cout << "Link : " << link << endl << "Replacement : " << replacement << endl ;
145 if ( debug ) cout << "Result : " << l << endl << endl ;
146 from = from + replacement.length() - 2 ;
147 }
148
149bool WIKI2XML::is_list_char ( chart c ) // For now...
150 {
151 if ( c == '*' ) return true ;
152 if ( c == '#' ) return true ;
153 if ( c == ':' ) return true ;
154 return false ;
155 }
156
157string WIKI2XML::get_list_tag ( chart c , bool open )
158 {
159 string ret ;
160 if ( debug ) cout << "get_list_tag : " << c << endl ;
161 if ( c == '*' ) ret = "ul" ;
162 if ( c == '#' ) ret = "ol" ;
163 if ( c == ':' ) ret = "dl" ;
164 if ( ret != "" )
165 {
166 string itemname = "li" ;
167 if ( c == ':' ) itemname = "dd" ;
168 if ( open ) ret = "<" + ret + "><" + itemname + ">" ;
169 else ret = "</" + itemname + "></" + ret + ">" ;
170 }
171 return ret ;
172 }
173
174string WIKI2XML::fix_list ( string &l )
175 {
176 int a , b ;
177 for ( a = 0 ; a < l.length() && is_list_char ( l[a] ) ; a++ ) ;
178 string newlist , pre ;
179 if ( a > 0 )
180 {
181 newlist = left ( l , a ) ;
182 while ( a < l.length() && l[a] == ' ' ) a++ ; // Removing leading blanks
183 l = l.substr ( a , l.length() ) ;
184 }
185 if ( debug ) cout << "fix_list : " << l << endl ;
186 if ( list == "" && newlist == "" ) return "" ;
187 for ( a = 0 ; a < list.length() &&
188 a < newlist.length() &&
189 list[a] == newlist[a] ; a++ ) ; // The common part, if any
190
191 for ( b = a ; b < list.length() ; b++ )
192 pre = get_list_tag ( list[b] , false ) + pre ; // Close old list tags
193 for ( b = a ; b < newlist.length() ; b++ )
194 pre += get_list_tag ( newlist[b] , true ) ; // Open new ones
195
196 if ( debug ) cout << "pre : " << pre << endl ;
197 if ( debug ) cout << "newlist : " << newlist << endl ;
198 list = newlist ;
199 return pre ;
200 }
201
202void WIKI2XML::parse_line ( string &l )
203 {
204 int a , b ;
205 if ( debug ) cout << l << endl ;
206 string pre ;
207 string oldlist = list ;
208 pre += fix_list ( l ) ;
209 if ( list != "" && list == oldlist )
210 {
211 string itemname = "li" ;
212 if ( right ( list , 1 ) == ":" ) itemname = "dd" ;
213 pre = "</" + itemname + "><" + itemname + ">" + pre ;
214 }
215
216 if ( l == "" ) // Paragraph
217 {
218 l = "<p/>" ;
219 }
220 else if ( left ( l , 4 ) == "----" ) // <hr>
221 {
222 for ( a = 0 ; a < l.length() && l[a] == l[0] ; a++ ) ;
223 pre += "<wikiurlcounter action=\"reset\"/><hr/>" ;
224 l = l.substr ( a , l.length() - a ) ;
225 }
226 else if ( l != "" && l[0] == '=' ) // Heading
227 {
228 for ( a = 0 ; a < l.length() && l[a] == '=' && l[l.length()-a-1] == '=' ; a++ ) ;
229 string h = "h0" ;
230 if ( a >= l.length() ) h = "" ; // No heading
231// else if ( l[a] != ' ' ) h = "" ;
232// else if ( l[l.length()-a-1] != ' ' ) h = "" ;
233 else if ( a < 1 || a > 9 ) h = "" ;
234 if ( h != "" )
235 {
236 l = l.substr ( a , l.length() - a*2 ) ;
237 h[1] += a ;
238 l = xml_embed ( l , h ) ;
239 }
240 }
241 else if ( l != "" && l[0] == ' ' ) // Pre-formatted text
242 {
243 for ( a = 0 ; a < l.length() && l[a] == ' ' ; a++ ) ;
244 l = l.substr ( a , l.length() ) ;
245 if ( l != "" )
246 {
247 pre += "<pre>" + l + "</pre>" ;
248 l = "" ;
249 }
250 }
251 else if ( left ( l , 2 ) == "{|" || left ( l , 2 ) == "|}" ||
252 ( tables.size() > 0 && l != "" && ( l[0] == '|' || l[0] == '!' ) ) )
253 {
254 pre += table_markup ( l ) ;
255 l = "" ;
256 }
257
258
259 if ( l != "" ) parse_line_sub ( l ) ;
260
261 if ( pre != "" ) l = pre + l ;
262 }
263
264bool WIKI2XML::is_external_link_protocol ( string protocol )
265 {
266 if ( protocol == "HTTP" ) return true ;
267 if ( protocol == "FTP" ) return true ;
268 if ( protocol == "MAILTO" ) return true ;
269 return false ;
270 }
271
272int WIKI2XML::scan_url ( string &l , int from )
273 {
274 int a ;
275 for ( a = from ; a < l.length() ; a++ )
276 {
277 if ( l[a] == ':' || l[a] == '/' || l[a] == '.' ) continue ;
278 if ( l[a] >= '0' && l[a] <= '9' ) continue ;
279 if ( is_text_char ( l[a] ) ) continue ;
280 break ; // End of URL
281 }
282 return a ;
283 }
284
285void WIKI2XML::parse_external_freelink ( string &l , int &from )
286 {
287 int a ;
288 for ( a = from - 1 ; a >= 0 && is_text_char ( l[a] ) ; a-- ) ;
289 if ( a == -1 ) return ;
290 a++ ;
291 string protocol = upper ( l.substr ( a , from - a ) ) ;
292 if ( debug ) cout << "protocol : " << protocol << endl ;
293 if ( !is_external_link_protocol ( protocol ) ) return ;
294 int to = scan_url ( l , a ) ;
295 string url = l.substr ( a , to - a ) ;
296 string replacement ;
297 replacement += xml_embed ( url , "url" ) ;
298 replacement += xml_embed ( url , "title" ) ;
299 l = left ( l , a ) + replacement + l.substr ( to , l.length() - to ) ;
300 from = a + replacement.length() - 1 ;
301 }
302
303void WIKI2XML::parse_external_link ( string &l , int &from )
304 {
305 string protocol = upper ( before_first ( ':' , l.substr ( from + 1 , l.length() - from ) ) ) ;
306 if ( !is_external_link_protocol ( protocol ) ) return ;
307 int to ;
308 for ( to = from + 1 ; to < l.length() && l[to] != ']' ; to++ ) ;
309 if ( to == l.length() ) return ;
310 string url = l.substr ( from + 1 , to - from - 1 ) ;
311 string title = after_first ( ' ' , url ) ;
312 url = before_first ( ' ' , url ) ;
313 string replacement ;
314 replacement += xml_embed ( url , "url" ) ;
315 if ( title == "" )
316 replacement += xml_embed ( "<wikiurlcounter action=\"add\"/>" , "title" ) ;
317 else replacement += xml_embed ( title , "title" ) ;
318 replacement = xml_embed ( replacement , "wikilink" , "type='external' protocol='" + protocol + "'" ) ;
319 l = left ( l , from ) + replacement + l.substr ( to + 1 , l.length() - to ) ;
320 from = from + replacement.length() - 1 ;
321 }
322
323void WIKI2XML::parse_line_sub ( string &l )
324 {
325 int a ;
326 for ( a = 0 ; a < l.length() ; a++ )
327 {
328 if ( l[a] == '[' && a+1 < l.length() && l[a+1] == '[' ) // [[Link]]
329 parse_link ( l , a , 'L' ) ;
330 else if ( l[a] == '{' && a+1 < l.length() && l[a+1] == '{' ) // {{Template}}
331 parse_link ( l , a , 'T' ) ;
332 else if ( l[a] == '[' ) // External link
333 parse_external_link ( l , a ) ;
334 else if ( a+2 < l.length() && l[a] == ':' && l[a+1] == '/' && l[a+2] == '/' ) // External freelink
335 parse_external_freelink ( l , a ) ;
336 else if ( l[a] == SINGLE_QUOTE ) // Bold and italics
337 {
338 parse_symmetric ( l , a , "'''" , "'''" , "<b>" , "</b>" , true ) ;
339 parse_symmetric ( l , a , "''" , "''" , "<i>" , "</i>" ) ;
340 }
341 }
342 }
343
344void WIKI2XML::parse_lines ( vector <string> &lines )
345 {
346 int a ;
347 for ( a = 0 ; a < lines.size() ; a++ )
348 {
349 parse_line ( lines[a] ) ;
350 }
351
352 string end ;
353
354 // Cleanup lists
355 end = fix_list ( end ) ;
356 if ( end != "" ) lines.push_back ( end ) ;
357
358 // Cleanup tables
359 end = "" ;
360 while ( tables.size() )
361 {
362 end += tables[tables.size()-1].close () ;
363 tables.pop_back () ;
364 }
365 if ( end != "" ) lines.push_back ( end ) ;
366 }
367
368void WIKI2XML::init ( string s )
369 {
370 list = "" ;
371 lines.clear () ;
372
373 // Now we remove evil HTML
374 allowed_html.clear () ;
375 allowed_html.push_back ( "b" ) ;
376 allowed_html.push_back ( "i" ) ;
377 allowed_html.push_back ( "p" ) ;
378 allowed_html.push_back ( "b" ) ;
379 allowed_html.push_back ( "br" ) ;
380 allowed_html.push_back ( "hr" ) ;
381 allowed_html.push_back ( "tt" ) ;
382 allowed_html.push_back ( "pre" ) ;
383 allowed_html.push_back ( "nowiki" ) ;
384 allowed_html.push_back ( "math" ) ;
385 allowed_html.push_back ( "strike" ) ;
386 allowed_html.push_back ( "u" ) ;
387 allowed_html.push_back ( "table" ) ;
388 allowed_html.push_back ( "caption" ) ;
389 allowed_html.push_back ( "tr" ) ;
390 allowed_html.push_back ( "td" ) ;
391 allowed_html.push_back ( "th" ) ;
392 allowed_html.push_back ( "li" ) ;
393 allowed_html.push_back ( "ul" ) ;
394 allowed_html.push_back ( "ol" ) ;
395 allowed_html.push_back ( "dl" ) ;
396 allowed_html.push_back ( "dd" ) ;
397 allowed_html.push_back ( "dt" ) ;
398 allowed_html.push_back ( "div" ) ;
399 allowed_html.push_back ( "h1" ) ;
400 allowed_html.push_back ( "h2" ) ;
401 allowed_html.push_back ( "h3" ) ;
402 allowed_html.push_back ( "h4" ) ;
403 allowed_html.push_back ( "h5" ) ;
404 allowed_html.push_back ( "h6" ) ;
405 allowed_html.push_back ( "h7" ) ;
406 allowed_html.push_back ( "h8" ) ;
407 allowed_html.push_back ( "h9" ) ;
408 allowed_html.push_back ( "small" ) ;
409 allowed_html.push_back ( "center" ) ;
410// allowed_html.push_back ( "" ) ;
411 int a ;
412 for ( a = 0 ; a < allowed_html.size() ; a++ )
413 allowed_html[a] = upper ( allowed_html[a] ) ;
414
415 vector <TXML> taglist ;
416 make_tag_list ( s , taglist ) ;
417 remove_evil_html ( s , taglist ) ;
418
419 // Now evaluate each line
420 explode ( '\n' , s , lines ) ;
421 }
422
423string WIKI2XML::get_xml ()
424 {
425 string xmlheader = "<?xml version='1.0' encoding='UTF-8'?>" ;
426 string ret = xmlheader + "<text>" + implode ( " " , lines ) + "</text>" ;
427
428 // Invalidating mdash
429 int a = ret.find ( "&mdash;" ) ;
430 while ( a >= 0 && a < ret.length() )
431 {
432 ret[a] = '!' ;
433 a = ret.find ( "&mdash;" , a ) ;
434 }
435
436 return ret ;
437 }
438
439void WIKI2XML::replace_part ( string &s , int from , int to , string with )
440 {
441 s = s.substr ( 0 , from ) + with + s.substr ( to + 1 , s.length() - to - 1 ) ;
442 }
443
444void WIKI2XML::replace_part_sync ( string &s , int from , int to , string with , vector <TXML> &list )
445 {
446 int a , b ;
447 replace_part ( s , from , to , with ) ;
448 for ( a = 0 ; a < list.size() ; a++ )
449 {
450 for ( b = 0 ; b < with.length() ; b++ ) list[a].insert_at ( from ) ;
451 for ( b = from ; b <= to ; b++ ) list[a].remove_at ( from ) ;
452 }
453 }
454
455// ATTENTION : this doesn't handle all HTML comments correctly!
456void WIKI2XML::make_tag_list ( string &s , vector <TXML> &list )
457 {
458 list.clear () ;
459 int a , b ;
460 for ( a = 0 ; a < s.length() ; a++ )
461 {
462 if ( s[a] == '>' ) // Rouge >
463 {
464 s[a] = ';' ;
465 s.insert ( a , "&gt" ) ;
466 continue ;
467 }
468 else if ( s[a] != '<' ) continue ;
469 b = find_next_unquoted ( '>' , s , a ) ;
470 if ( b == -1 ) // Rouge <
471 {
472 s[a] = ';' ;
473 s.insert ( a , "&lt" ) ;
474 continue ;
475 }
476 list.push_back ( TXML ( a , b , s ) ) ;
477 a = list[list.size()-1].to ;
478 }
479 }
480
481void WIKI2XML::remove_evil_html ( string &s , vector <TXML> &taglist )
482 {
483 int a , b ;
484 for ( a = 0 ; a < taglist.size() ; a++ )
485 {
486 string tag = upper ( taglist[a].name ) ;
487 for ( b = 0 ; b < allowed_html.size() && tag != allowed_html[b] ; b++ ) ;
488 if ( b < allowed_html.size() ) continue ;
489 replace_part_sync ( s , taglist[a].from , taglist[a].from , "&lt;" , taglist ) ;
490 replace_part_sync ( s , taglist[a].to , taglist[a].to , "&gt;" , taglist ) ;
491 }
492 }
493
494string WIKI2XML::table_markup ( string &l )
495 {
496 int a ;
497 string ret ;
498 if ( left ( l , 2 ) == "{|" ) // Open table
499 {
500 ret = "<wikitable>" ;
501 ret += xml_embed ( l.substr ( 2 , l.length() - 2 ) , "wikiparameter" ) ;
502 tables.push_back ( TTableInfo () ) ;
503 }
504 else if ( left ( l , 2 ) == "|}" )
505 {
506 ret = tables[tables.size()-1].close () ;
507 tables.pop_back () ;
508 }
509 else if ( left ( l , 2 ) == "|-" )
510 {
511 ret = tables[tables.size()-1].new_row () ;
512 for ( a = 1 ; a < l.length() && l[a] == '-' ; a++ ) ;
513 ret += xml_params ( l.substr ( a , l.length() - a ) ) ;
514 }
515 else
516 {
517 string init ;
518 if ( left ( l , 2 ) == "|+" )
519 {
520 init = "caption" ;
521 l = l.substr ( 2 , l.length() - 2 ) ;
522 }
523 else if ( l[0] == '!' )
524 {
525 init = "header" ;
526 l = l.substr ( 1 , l.length() - 1 ) ;
527 }
528 else if ( l[0] == '|' )
529 {
530 init = "cell" ;
531 l = l.substr ( 1 , l.length() - 1 ) ;
532 }
533 vector <string> sublines ;
534 for ( a = 0 ; a + 1 < l.length() ; a++ )
535 {
536 if ( l[a] == '|' && l[a+1] == '|' )
537 {
538 sublines.push_back ( left ( l , a ) ) ;
539 l = l.substr ( a + 2 , l.length() - a ) ;
540 a = -1 ;
541 }
542 }
543 if ( l != "" ) sublines.push_back ( l ) ;
544 for ( a = 0 ; a < sublines.size() ; a++ )
545 {
546 l = sublines[a] ;
547 parse_line_sub ( l ) ;
548 string params ;
549 int b = find_next_unquoted ( '|' , l ) ;
550 if ( b != -1 )
551 {
552 params = left ( l , b ) ;
553 l = l.substr ( b + 1 , l.length() - b ) ;
554 }
555 if ( params != "" ) l = xml_params ( params ) + l ;
556 ret += tables[tables.size()-1].new_cell ( init ) ;
557 ret += l ;
558 }
559 }
560 return ret ;
561 }
Note: See TracBrowser for help on using the repository browser.