source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wiki2xml/php/parsertest.php@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 3.8 KB
Line 
<?php

// Report every class of PHP diagnostic while the comparison page is generated.
error_reporting(E_ALL);

// Pull in the converter code; presumably this is what declares the
// MediaWikiConverter and ContentProviderHTTP classes instantiated further
// down in this script (confirm against mediawiki_converter.php).
require_once "mediawiki_converter.php";
6
/**
 * Prepare a markup fragment for literal display inside the HTML report.
 *
 * Steps, in order:
 *  1. put a newline in front of every closing </li>, </p> and </dd> tag so
 *     long fragments wrap at element boundaries;
 *  2. entity-escape the text so the markup is shown rather than rendered;
 *  3. turn each newline into "<br/>" followed by the newline.
 *
 * The escaping flags and charset are given explicitly. Relying on the
 * htmlentities() defaults makes the output depend on the PHP version: before
 * PHP 8.1 single quotes were not escaped and one invalid UTF-8 byte caused the
 * whole string to be returned empty.
 *
 * @param string $s Raw markup or wikitext.
 * @return string HTML-safe text with <br/> line breaks.
 */
function treat($s)
{
    $s = str_replace(
        array('</li>', '</p>', '</dd>'),
        array("\n</li>", "\n</p>", "\n</dd>"),
        $s
    );

    // ENT_SUBSTITUTE keeps the rest of the text when a byte sequence is invalid.
    $s = htmlentities($s, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    return str_replace("\n", "<br/>\n", $s);
}
15
/**
 * Parse the lines of a parserTests.txt file into test cases and articles.
 *
 * The format is a sequence of sections introduced by "!! <command>" lines;
 * the text between two such lines is the payload of the preceding command.
 * "!! end" and "!! endarticle" close a record. Outside a record, blank lines
 * and lines starting with "#" are skipped.
 *
 * A record whose first command is "article" is stored as title => text.
 * A record whose first command is "test" becomes an object with the
 * properties name, input, options and result (result being the payload of
 * the last section before "!! end"). Records of any other kind are dropped.
 *
 * @param array $lines File content split into lines, without line endings.
 * @return array Two-element array: list of test objects, then the
 *               title => text map of articles.
 */
function parsertest_parse_lines($lines)
{
    $tests = array();
    $articles = array();
    $data = array();    // payload of each finished section, keyed by command
    $cmds = array();    // commands seen in the current record, in order
    $command = '';      // command whose payload is being collected
    $d = '';            // payload collected so far

    foreach ($lines as $l) {
        if ($command === '') {
            if ($l === '') {
                continue;
            }
            if (substr($l, 0, 1) === '#') {
                continue;
            }
        }

        if (substr($l, 0, 2) !== '!!') {
            // Ordinary payload line.
            if ($d !== '') {
                $d .= "\n";
            }
            $d .= $l;
            continue;
        }

        $new_command = strtolower(trim(substr($l, 2)));
        if ($new_command === 'end' || $new_command === 'endarticle') {
            $new_command = ''; // both terminators are handled the same way
        }

        if ($new_command === '') {
            // $cmds[0] is always the empty pseudo-command that precedes the
            // record, so the record kind is the second entry - if there is one.
            $kind = isset($cmds[1]) ? $cmds[1] : '';

            if ($kind === 'article') {
                $articles[trim($data['article'])] = $d;
            } elseif ($kind === 'test') {
                // Assigning properties to '' is an Error in PHP 8.
                $t = new stdClass();
                $t->name = trim($data['test']);
                $t->input = isset($data['input']) ? $data['input'] : '';
                $t->options = isset($data['options']) ? $data['options'] : '';
                $t->result = $d;
                $tests[] = $t;
            }
            $data = array();
            $cmds = array();
        } else {
            $data[$command] = $d;
            $cmds[] = $command;
        }

        $d = '';
        $command = $new_command;
    }

    return array($tests, $articles);
}

# This path is for trunk
$raw = file_get_contents("../../phase3/maintenance/parserTests.txt");

// A failed read has already raised a PHP warning; carry on with no tests.
$lines = ($raw === false) ? array() : explode("\n", str_replace("\r", "", $raw));

list($tests, $articles) = parsertest_parse_lines($lines);
58
/**
 * Normalise an HTML fragment so that expected and generated output can be
 * compared as plain strings despite insignificant whitespace differences.
 *
 * Steps, in order: trim; drop the space after <li> and <dd>; drop a newline
 * directly before a tag; drop carriage returns; turn remaining newlines into
 * spaces; optionally drop a space directly before </p>; collapse runs of
 * spaces into a single space.
 *
 * @param string $html              Fragment to normalise.
 * @param bool   $strip_space_in_p  Also replace " </p>" with "</p>".
 * @return string Normalised fragment.
 */
function parsertest_normalize_html($html, $strip_space_in_p)
{
    $html = trim($html);
    $html = str_replace("<li> ", "<li>", $html);
    $html = str_replace("<dd> ", "<dd>", $html);
    $html = str_replace("\n<", "<", $html);
    $html = str_replace("\r", "", $html);
    $html = str_replace("\n", " ", $html);
    if ($strip_space_in_p) {
        $html = str_replace(" </p>", "</p>", $html);
    }

    // Repeat until stable: one pass only halves a run of spaces.
    while (strpos($html, "  ") !== false) {
        $html = str_replace("  ", " ", $html);
    }

    return $html;
}

# Run tests
$xmlg["useapi"] = false;
$xmlg["book_title"] = 'Title';
$xmlg["site_base_url"] = 'en.wikipedia.org/w';
$xmlg["resolvetemplates"] = 'all';
$xmlg['templates'] = array();
$xmlg['add_gfdl'] = false;
$xmlg['keep_interlanguage'] = true;
$xmlg['keep_categories'] = true;
$xmlg['xml_articles_header'] = "<articles>";

$xmlg['xhtml_justify'] = false;
$xmlg['xhtml_logical_markup'] = false;
$xmlg['xhtml_source'] = false;

$cnt = 1;
print "<table border=1 width='100%'><tr><th>Test</th><th>Result</th><th>wiki2xml</th><th>Input</th><th>XML</th></tr>";
foreach ($tests as $t) {
    // NOTE(review): not used directly here; presumably the converter reads it
    // as a global, so it is kept - confirm against mediawiki_converter.php.
    $content_provider = new ContentProviderHTTP;
    $converter = new MediaWikiConverter;
    $xml = $converter->article2xml("", $t->input, $xmlg);
    $nr = $converter->articles2xhtml($xml, $xmlg);

    // Keep only what lies between <body> and </body>. array_pop() and
    // array_shift() take their argument by reference, so the explode()
    // result has to be stored in a variable first.
    $parts = explode('<body>', $nr, 2);
    $nr = array_pop($parts);
    $parts = explode('</body>', $nr, 2);
    $nr = array_shift($parts);

    // Bring both sides into the same whitespace shape before comparing.
    // Only the expected result gets the " </p>" clean-up.
    $res = parsertest_normalize_html($t->result, true);
    $nr = parsertest_normalize_html($nr, false);

    # Indicator color
    $col = ($res === $nr) ? 'green' : 'red';

    print "<tr><th bgcolor='$col'>" . treat($t->name) . "</th>";
    print "<td>" . treat($res) . "</td>";
    print "<td>" . treat($nr) . "</td>";
    print "<td>" . treat($t->input) . "</td>";
    print "<td>" . treat($xml) . "</td>";
    print "</tr>";

    // Only the first 40 tests are reported.
    $cnt++;
    if ($cnt > 40) {
        break;
    }
}
print "</table>";
126
127?>
Note: See TracBrowser for help on using the repository browser.