1 | <?php
|
---|
2 |
|
---|
# Change these to match your local settings
|
---|
# Path of the XML dump to convert; it is handed to scan_xml_file() and its
# file name (minus ".xml") names the output directory.
$dumpfile = 'K:\\dewiki-20060327-pages-articles.xml';
# Parent location under which the output directory is created.
$basedir = 'C:';
|
---|
6 |
|
---|
7 | #______________________________________________________________________________
|
---|
8 | # GLOBAL VARIABLES
|
---|
# Number of PAGE elements stored so far (drives the progress dots).
$page_counter = 0;
# Stack of the currently open XML element names, innermost last.
$tags = array();
# Scratch state shared by the parser callbacks
# (keys used: name, key, text, title, namespace).
$mem = array();
# Namespace key => namespace name, collected from NAMESPACE elements.
$namespaces = array();
# Output directory; the main section fills it in before parsing starts.
$dir = "";
|
---|
14 |
|
---|
15 | # FUNCTIONS
|
---|
16 |
|
---|
17 | require_once ( "global_functions.php" ) ;
|
---|
18 |
|
---|
# Writes $text to a file derived from $loc.
#
# $loc   object; its "fullname" property is the target path without extension
#        and its "file" property is used only in the failure message.
# $text  the content to write.
# $mode  "text" writes "<fullname>.txt" uncompressed; "gzip" writes
#        "<fullname>.gz" with mode 'w9'. Any other value does nothing.
#
# If the target cannot be opened, a message is printed and the function
# returns without attempting to write or close the invalid handle.
function store_file ( &$loc , &$text , $mode = "text" ) {
    if ( $mode == "text" ) {
        $handle = fopen ( $loc->fullname . ".txt" , 'wb' ) ;
        if ( !$handle ) {
            print "Failed to open {$loc->file}.txt!<br/>" ;
            flush () ;
            return ; # Nothing to write to; skip this page and keep going
        }
        fwrite ( $handle , $text ) ;
        fclose ( $handle ) ;
    } else if ( $mode == "gzip" ) {
        $gz = gzopen ( $loc->fullname . ".gz" , 'w9' ) ;
        if ( !$gz ) {
            print "Failed to open {$loc->file}.gz!<br/>" ;
            flush () ;
            return ; # Same as above: never touch a handle that failed to open
        }
        gzwrite ( $gz , $text ) ;
        gzclose ( $gz ) ;
    }
}
|
---|
36 |
|
---|
# Returns the current Unix timestamp as a float, including the
# fractional (microsecond) part.
function microtime_float()
{
    return microtime(true);
}
|
---|
42 |
|
---|
43 | # Global functions for parsing
|
---|
44 |
|
---|
# Start-element callback for the XML parser.
#
# $parser  the parser resource (unused).
# $name    name of the element being opened.
# $attrs   attribute map of the element.
#
# Records $name as the current element in $mem['name'] and pushes it on the
# $tags stack. For NAMESPACE it remembers the KEY attribute, and for both
# NAMESPACE and TEXT it clears the text buffer so that an element without any
# character data (e.g. an empty namespace entry) does not inherit the text of
# a previous element.
function XML2TXT_START($parser, $name, $attrs) {
    global $mem , $tags ;
    $mem["name"] = $name ;
    $tags[] = $name ;
    if ( $name == "NAMESPACE" ) {
        $mem['key'] = isset ( $attrs["KEY"] ) ? $attrs["KEY"] : "" ;
        $mem['text'] = "" ;
    } else if ( $name == "TEXT" ) {
        $mem['text'] = "" ;
    }
}
|
---|
55 |
|
---|
# End-element callback for the XML parser.
#
# Acts on the element recorded in $mem['name']:
#  - NAMESPACE: stores the collected text in $namespaces under $mem['key'].
#  - PAGE: resolves the target location via get_file_location_global(),
#    writes the page text with store_file() and bumps $page_counter,
#    printing a dot every 1000 pages and a line break every 50000.
# Afterwards the closed element is removed from the $tags stack and
# $mem['name'] is set to the new innermost element ("" if none is left).
function XML2TXT_END($parser, $name)
{
    global $mem, $namespaces, $tags, $page_counter, $dir;

    switch ($mem['name']) {
        case 'NAMESPACE':
            $namespaces[$mem['key']] = $mem['text'];
            break;

        case 'PAGE':
            $loc = get_file_location_global($dir, $mem['namespace'], $mem['title'], true);
            store_file($loc, $mem['text'], 'text');

            $page_counter++;
            if ($page_counter % 1000 == 0) {
                print '.';
                if ($page_counter % 50000 == 0) {
                    print "<br/>";
                }
                flush();
            }
            break;
    }

    # Drop the element that just closed; its parent becomes current again.
    array_pop($tags);
    $mem['name'] = count($tags) > 0 ? end($tags) : "";
}
|
---|
80 |
|
---|
# Character-data callback for the XML parser.
#
# $parser  the parser resource (unused).
# $data    the piece of character data being delivered.
#
# What happens depends on the current element ($mem['name']):
#  - NAMESPACE: $data becomes the namespace name in $mem['text'].
#  - TITLE: if $data starts with "<name>:" for a known namespace with a
#    positive key, that key is stored in $mem['namespace'] and the prefix is
#    stripped; otherwise the namespace is 0. The result goes to $mem['title'].
#  - TEXT: $data is appended to $mem['text'].
#
# NOTE(review): an XML parser may deliver one element's character data in
# several calls; NAMESPACE and TITLE keep only the most recent piece here.
# Confirm whether titles containing entities are affected.
function XML2TXT_DATA ( $parser, $data ) {
    global $mem , $namespaces ;
    if ( $mem['name'] == 'NAMESPACE' ) {
        $mem['text'] = $data ;
    } else if ( $mem['name'] == 'TITLE' ) {
        $ns = 0 ;
        foreach ( $namespaces AS $k => $v ) {
            if ( $k <= 0 ) continue ; # Only namespaces with positive keys are matched
            $prefix = $v . ":" ;
            # Compare the start of the title itself against "<namespace>:"
            if ( substr ( $data , 0 , strlen ( $prefix ) ) !== $prefix ) continue ;
            $ns = $k ;
            $data = substr ( $data , strlen ( $prefix ) ) ;
            break ;
        }
        $mem['title'] = $data ;
        $mem['namespace'] = $ns ;
    } else if ( $mem['name'] == 'TEXT' ) {
        $mem['text'] .= $data ;
    }
}
|
---|
100 |
|
---|
# Parses the XML dump at $xml_filename and writes the collected namespaces.
#
# The file is fed to an XML parser in 8192-byte chunks, with XML2TXT_START,
# XML2TXT_END and XML2TXT_DATA registered as callbacks. When parsing is done,
# the elapsed time is printed and every entry of $namespaces is written to
# "<$dir>/namespaces.txt" as one "key:name" line.
#
# Terminates the script via die() if the dump cannot be opened or the parser
# reports an error. If namespaces.txt cannot be opened, a message is printed
# and the function returns.
function scan_xml_file ( $xml_filename ) {
    global $namespaces , $dir ;
    $xml_parser_handle = xml_parser_create();
    xml_set_element_handler($xml_parser_handle, "XML2TXT_START", "XML2TXT_END");
    xml_set_character_data_handler($xml_parser_handle, "XML2TXT_DATA");

    if (!($parse_handle = fopen($xml_filename, 'r'))) {
        die("FEHLER: Datei $xml_filename nicht gefunden.");
    }

    $t1 = microtime_float() ;
    $is_final = false ;
    while ( !$is_final ) {
        $xml_data = fread($parse_handle, 8192) ;
        if ( $xml_data === false ) {
            $xml_data = '' ; # Treat a failed read like end of input
        }
        # The last call must carry is_final=true. Testing the chunk for
        # emptiness explicitly (instead of for truthiness) keeps a chunk such
        # as "0" from ending the loop early, and guarantees the final call
        # even when the file size is an exact multiple of the chunk size.
        $is_final = ( $xml_data === '' ) || feof($parse_handle) ;
        if (!xml_parse($xml_parser_handle, $xml_data, $is_final)) {
            die(sprintf('XML error: %s at line %d',
                xml_error_string(xml_get_error_code($xml_parser_handle)),
                xml_get_current_line_number($xml_parser_handle)));
        }
    }
    fclose ( $parse_handle ) ;
    $t2 = microtime_float() - $t1 ;
    print "Took {$t2} seconds total.<br/>" ; flush () ;

    xml_parser_free($xml_parser_handle);

    $handle = fopen($dir."/namespaces.txt", 'wb') ;
    if ( !$handle ) {
        print "Failed to open {$dir}/namespaces.txt!<br/>" ;
        flush () ;
        return ;
    }
    foreach ( $namespaces AS $ns => $nst ) {
        $t = "{$ns}:{$nst}\n" ;
        fwrite($handle, $t) ;
    }
    fclose ( $handle ) ;
}
|
---|
138 |
|
---|
139 |
|
---|
140 | # MAIN
|
---|
141 |
|
---|
# Derive the output directory from the dump file name: take the last path
# component (accepting either slash style), drop ".xml" and put the result
# under $basedir. The intermediate variable is needed because array_pop()
# takes its argument by reference.
$dump_path_parts = explode ( "/" , str_replace ( "\\" , "/" , $dumpfile ) ) ;
$dir = array_pop ( $dump_path_parts ) ;
$dir = $basedir . "/" . str_replace ( ".xml" , "" , $dir ) ;

@set_time_limit ( 0 ) ; # No time limit
#ini_set('user_agent','MSIE 4\.0b2;'); # Fake user agent
header ('Content-type: text/html; charset=utf-8');
# An already existing directory is fine; any other failure would make every
# later write fail, so stop right away.
if ( !is_dir ( $dir ) && !mkdir ( $dir ) ) {
    die ( "Failed to create output directory {$dir}" ) ;
}
scan_xml_file ( $dumpfile ) ;
|
---|
150 |
|
---|
151 | ?>
|
---|