source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wiki2xml/php/content_provider.php@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 11.9 KB
Line 
1<?php
2
/**
 * Abstract base class for all content providers.
 *
 * Keeps the bookkeeping shared by every provider (registered articles,
 * authors, load timing) and the image helpers. The two text accessors
 * get_wiki_text() and get_template_text() are dummies here and are meant
 * to be overridden by subclasses.
 */
class ContentProvider {
    var $load_time = 0 ; # Time to load text and templates, to judge actual parsing speed
    var $article_list = array () ; # Titles registered via add_article(), stored trimmed and urlencode()d
    var $authors = array () ;
    var $block_file_download = false ; # If true, copyimagefromwiki() never downloads anything

    function get_wiki_text ( $title , $do_cache = false ) { return "" ; } # dummy
    function get_template_text ( $title ) { return "" ; } # dummy

    /**
     * Registers a title as a known article.
     * The title is trimmed and urlencode()d before it is stored.
     */
    function add_article ( $title ) {
        $this->article_list[] = urlencode ( trim ( $title ) ) ;
    }

    /**
     * Tells whether a title was registered through add_article().
     * The title is normalized the same way add_article() does it.
     * Returns a boolean.
     */
    function is_an_article ( $title ) {
        $title = urlencode ( trim ( $title ) ) ;
        return in_array ( $title , $this->article_list ) ;
    }

    /**
     * XXX TODO: why are some negative?
     * Gets the numeric namespace from the prefix before the first ":".
     * The comparison is case-insensitive; only the English and German
     * prefixes listed below are recognized.
     *  "0"  = no prefix, empty prefix, or unknown prefix
     *  "6"  = images
     *  "-8" = category link
     *  "-9" = interlanguage link (any prefix shorter than 4 characters, or "simple")
     *  "11" = templates
     * The negative values are the ad-hoc ("hackish") markers for link types.
     */
    function get_namespace_id ( $text ) {
        $text = strtoupper ( $text ) ;
        $text = explode ( ":" , $text , 2 ) ;
        if ( count ( $text ) != 2 ) return 0 ; # No ":" at all
        $text = trim ( array_shift ( $text ) ) ;
        if ( $text == "" ) return 0 ; # Leading ":"
        $ns = 0 ;

        if ( $text == "CATEGORY" || $text == "KATEGORIE" ) return -8 ; # Hackish, for category link
        if ( strlen ( $text ) < 4 ) return -9 ; # Hackish, for interlanguage link
        if ( $text == "SIMPLE" ) return -9 ;

        # Horrible manual hack, for now
        if ( $text == "IMAGE" || $text == "BILD" ) $ns = 6 ;
        if ( $text == "TEMPLATE" || $text == "VORLAGE" ) $ns = 11 ;

        return $ns ;
    }

    /**
     * Downloads an image into $xmlg['image_destination'] unless it is
     * already there or downloads are blocked via $block_file_download.
     *
     * @param $name Image name; its urlencode()d form is the local file name
     * @param $url  Source URL; if empty, it is derived with get_image_url()
     * @return The urlencode()d file name (returned whether or not a download happened)
     */
    function copyimagefromwiki ( $name , $url = "" ) {
        global $xmlg ;
        $dir = $xmlg['image_destination'] ;
        if ( $url == "" )
            $url = $this->get_image_url ( $name ) ; # Was the bareword "name", which never passed the real image name
        $fname = urlencode ( $name ) ;
        $target = $dir . "/" . $fname ;
        if ( !file_exists ( $target ) && !$this->block_file_download ) {
            @mkdir ( $dir ) ;
            $fh = @fopen ( $target , 'w' ) ;
            if ( $fh !== false ) { # Without a valid handle cURL would not write to the file
                # dub sez... use cURL
                $ch = curl_init () ;
                curl_setopt ( $ch , CURLOPT_URL , $url ) ;
                curl_setopt ( $ch , CURLOPT_FILE , $fh ) ;
                $ok = curl_exec ( $ch ) ;
                curl_close ( $ch ) ;
                fclose ( $fh ) ;
                # A failed transfer would otherwise leave an empty file that blocks every later retry
                if ( $ok === false ) @unlink ( $target ) ;
            }
        }
        return $fname ;
    }

    /**
     * urlencode() variant that first turns spaces into underscores.
     */
    function myurlencode ( $t ) {
        $t = str_replace ( " " , "_" , $t ) ;
        $t = urlencode ( $t ) ;
        return $t ;
    }

    /**
     * Builds the download URL for an image.
     *
     * The path is "<h>/<hh>/<name>", where <h> and <hh> are the first one
     * and first two hex digits of the md5 of the name (first letter
     * upper-cased, spaces as underscores).
     *
     * For *.wikipedia.org and *.wikibooks.org sites the project/language
     * location on upload.wikimedia.org is probed by opening it; if that
     * fails, the commons URL is returned instead. For any other site the
     * URL is "http://<site_base_url>/images/<path>" and nothing is probed.
     */
    function get_image_url ( $name ) {
        global $xmlg ;
        $site = $xmlg['site_base_url'] ;
        $parts = explode ( ".wikipedia.org/" , $site ) ;
        $parts2 = explode ( ".wikibooks.org/" , $site ) ;

        $image2 = ucfirst ( str_replace ( " " , "_" , $name ) ) ;
        $m = md5( $image2 ) ;
        $m1 = substr ( $m , 0 , 1 ) ;
        $m2 = substr ( $m , 0 , 2 ) ;
        $i = "{$m1}/{$m2}/" . $this->myurlencode ( ucfirst ( $name ) ) ;

        # Everything in front of ".<project>.org/" is taken as the language code
        if ( count ( $parts ) > 1 ) {
            $project = "wikipedia" ;
            $lang = array_shift ( $parts ) ;
        } else if ( count ( $parts2 ) > 1 ) {
            $project = "wikibooks" ;
            $lang = array_shift ( $parts2 ) ;
        } else {
            return "http://{$site}/images/{$i}" ;
        }

        $url = "http://upload.wikimedia.org/{$project}/{$lang}/{$i}" ;
        $h = @fopen ( $url , "r" ) ;
        if ( $h === false ) return "http://upload.wikimedia.org/wikipedia/commons/{$i}" ; # Fall back to commons
        fclose ( $h ) ;
#       print "<a href='{$url}'>{$url}</a><br/>" ;
        return $url ;
    }

    /**
     * Whether images should be shown for this provider; always true here.
     */
    function do_show_images () {
        return true ;
    }

}
116
117
/**
 * Access through HTTP protocol.
 *
 * Fetches wiki text from the site named in the global $xmlg configuration,
 * using (depending on $xmlg) api.php, Special:Export, the WikiProxy tool
 * or index.php?action=raw.
 */
class ContentProviderHTTP extends ContentProvider {
    var $article_cache = array () ; # title => text, filled by get_wiki_text() when caching is requested
    var $first_title = "" ; # First non-empty title requested; used for PAGENAME/PAGENAMEE
    var $load_error ;

    /**
     * Returns the text between the first "<tag ...>" and the following
     * "</tag>" in $text, or "" if either part is missing.
     * $text is taken by reference but is not modified.
     */
    function between_tag ( $tag , &$text ) {
        $a = explode ( "<{$tag}" , $text , 2 ) ;
        if ( count ( $a ) == 1 ) return "" ;
        # Drop the rest of the opening tag, up to and including the first ">"
        $a = explode ( ">" , " " . array_pop ( $a ) , 2 ) ;
        if ( count ( $a ) == 1 ) return "" ;
        $a = explode ( "</{$tag}>" , array_pop ( $a ) , 2 ) ;
        if ( count ( $a ) == 1 ) return "" ;
        return array_shift ( $a ) ;
    }

    /**
     * Downloads the raw text of one page.
     *
     * Source selection, in this order:
     *  - $xmlg["useapi"]: api.php with expanded templates
     *  - $xmlg["use_special_export"] == 1: Special:Export; this also
     *    refills $this->authors with the contributor user names
     *  - $xmlg["use_toolserver_url"]: the WikiProxy tool
     *  - otherwise: index.php?action=raw
     *
     * @param $title Page title (not yet URL-encoded)
     * @return The page text, or "" if nothing could be retrieved
     */
    function do_get_contents ( $title ) {
        global $xmlg ;
        $use_se = isset ( $xmlg["use_special_export"] ) && $xmlg["use_special_export"] == 1 ;

        if ( !empty ( $xmlg["useapi"] ) ) {
            $url = "http://" . $xmlg["site_base_url"] . "/api.php?format=php&action=query&prop=revisions&rvexpandtemplates=1&rvprop=timestamp|user|comment|content&titles=" . urlencode ( $title ) ;
            $raw = @file_get_contents ( $url ) ;
            if ( $raw === false ) return "" ;
            # NOTE(review): unserialize() on data from a remote server can instantiate
            # arbitrary classes; consider format=json + json_decode(), or the
            # allowed_classes option, if the minimum PHP version permits.
            $data = @unserialize ( $raw ) ;
            if ( !is_array ( $data ) ) return "" ;
            if ( !isset ( $data['query']['pages'] ) || !is_array ( $data['query']['pages'] ) ) return "" ;
            $page = array_shift ( $data['query']['pages'] ) ; # Only the first returned page is used
            if ( !isset ( $page['revisions'][0]['*'] ) ) return "" ;
            return $page['revisions'][0]['*'] ;
        }

        if ( $use_se ) {
            $url = "http://" . $xmlg["site_base_url"] . "/index.php?listauthors=1&title=Special:Export/" . urlencode ( $title ) ;
        } else if ( !empty ( $xmlg["use_toolserver_url"] ) ) {
            $u = urlencode ( $title ) ;
            # array_shift() takes its argument by reference, so it needs a real variable
            $site_parts = explode ( "/" , $xmlg["site_base_url"] ) ;
            $site = array_shift ( $site_parts ) ;
            $url = "http://tools.wikimedia.de/~daniel/WikiSense/WikiProxy.php?wiki={$site}&title={$u}&rev=0&go=Fetch" ;
        } else {
            $url = "http://" . $xmlg["site_base_url"] . "/index.php?action=raw&title=" . urlencode ( $title ) ;
        }

        $s = @file_get_contents ( $url ) ;
        if ( $s === false ) $s = "" ; # Failed download: report "no text" like the other paths do

        if ( $use_se ) {
            $text = html_entity_decode ( $this->between_tag ( "text" , $s ) ) ;
            $this->authors = array () ;
            $authors = $this->between_tag ( "contributors" , $s ) ;
            $authors = explode ( "</contributor><contributor>" , $authors ) ;
            foreach ( $authors AS $author ) {
                $id = $this->between_tag ( "id" , $author ) ;
                if ( $id == '0' || $id == '' ) continue ; # Skipping IPs and (possibly) broken entries
                $name = $this->between_tag ( "username" , $author ) ;
                $this->authors[] = $name ;
            }
            $s = $text ;
        }
        return $s ;
    }

    /**
     * Returns the wiki text of a page, following one "#REDIRECT [[...]]".
     *
     * The time spent downloading is added to $this->load_time. A result
     * that starts with an HTML doctype is treated as a wrong-title error
     * page and replaced by "".
     *
     * @param $title    Page title; surrounding whitespace is ignored
     * @param $do_cache If true, the result is stored in $this->article_cache
     * @return The page text, or "" for an empty title or a failed lookup
     */
    function get_wiki_text ( $title , $do_cache = false ) {
        global $xmlg ;
        $this->load_error = false ; # Was assigned to a local variable, leaving the property untouched
        $title = trim ( $title ) ;
        if ( $title == "" ) return "" ; # Just in case...
        if ( isset ( $this->article_cache[$title] ) ) # Already in the cache
            return $this->article_cache[$title] ;

        if ( $this->first_title == "" ) $this->first_title = $title ;

        # Retrieve it
        $t1 = microtime_float() ;
        $s = $this->do_get_contents ( $title ) ;
        if ( strtoupper ( substr ( $s , 0 , 9 ) ) == "#REDIRECT" ) {
            # The redirect target is whatever stands between the first "[[" and the next "]]"
            $t2 = explode ( "[[" , $s , 2 ) ;
            $t2 = array_pop ( $t2 ) ;
            $t2 = explode ( "]]" , $t2 , 2 ) ;
            $t2 = array_shift ( $t2 ) ;
            $s = $this->do_get_contents ( $t2 ) ;
        }
        $this->load_time += microtime_float() - $t1 ;

        $comp = '<!DOCTYPE html PUBLIC "-//W3C//DTD' ;
        if ( substr ( $s , 0 , strlen ( $comp ) ) == $comp ) $s = "" ; # Catching wrong title error

        if ( $do_cache ) $this->article_cache[$title] = $s ;
        return $s ;
    }

    /**
     * Server-relative URL of a page: "/<path part of site_base_url>/index.php?title=...".
     */
    function get_local_url ( $title ) {
        # array_pop() takes its argument by reference, so it needs a real variable
        $parts = explode ( "/" , $this->get_var ( 'site_base_url' ) , 2 ) ;
        return "/" . array_pop ( $parts ) . "/index.php?title=" . urlencode ( $title ) ;
    }

    /**
     * "http://" plus the host part (everything before the first "/") of site_base_url.
     */
    function get_server_url () {
        $parts = explode ( "/" , $this->get_var ( 'site_base_url' ) , 2 ) ;
        return "http://" . array_shift ( $parts ) ;
    }

    /**
     * Absolute URL of a page: get_server_url() followed by get_local_url().
     */
    function get_full_url ( $title ) {
        return $this->get_server_url () . $this->get_local_url ( $title ) ;
    }

    /**
     * The configured template namespace name ($xmlg['namespace_template']), or false if unset.
     */
    function get_namespace_template () {
        return $this->get_var ( 'namespace_template' ) ;
    }

    /**
     * Reads one entry of the global $xmlg configuration; false if it is not set.
     */
    function get_var ( $var ) {
        global $xmlg ;
        if ( !isset ( $xmlg[$var] ) ) return false ;
        return $xmlg[$var] ;
    }

    /**
     * Resolves a template call to its text.
     *
     * A few fixed variables (PAGENAME, PAGENAMEE, SERVER, CURRENTDAYNAME,
     * localurl:...) are answered directly. Anything else is loaded as a
     * page: titles without ":" get the template namespace prefixed, and a
     * leading ":" (main namespace) is stripped. Template texts are cached.
     */
    function get_template_text ( $title ) {
        # Check for fix variables
        if ( $title == "PAGENAME" ) return $this->first_title ;
        if ( $title == "PAGENAMEE" ) return urlencode ( $this->first_title ) ;
        if ( $title == "SERVER" ) return $this->get_server_url () ;
        if ( $title == "CURRENTDAYNAME" ) return date ( "l" ) ;
        if ( strtolower ( substr ( $title , 0 , 9 ) ) == "localurl:" )
            return $this->get_local_url ( substr ( $title , 9 ) ) ;

        $title = trim ( $title ) ;
        if ( count ( explode ( ":" , $title , 2 ) ) == 1 ) # Does the template title contain a ":"?
            $title = $this->get_namespace_template() . ":" . $title ;
        else if ( substr ( $title , 0 , 1 ) == ":" ) # Main namespace
            $title = substr ( $title , 1 ) ;
        return $this->get_wiki_text ( $title , true ) ; # Cache template texts
    }

    /**
     * Rendering of an internal link; this provider just returns the link text.
     */
    function get_internal_link ( $target , $text ) {
        return $text ; # Dummy
    }
}
260
261
262
263
/**
 * Access through text file structure.
 *
 * Pages are read from files located via get_file_location_global() under
 * $this->basedir, with $this->file_ending appended to the file name.
 */
class ContentProviderTextFile extends ContentProviderHTTP {
    var $file_ending = ".txt" ;
    var $basedir ; # Not assigned anywhere in this class; presumably set by the code that creates the provider - TODO confirm

    /**
     * Reads the page from disk instead of downloading it.
     */
    function do_get_contents ( $title ) {
        return $this->get_page_text ( $title ) ;
    }

    /**
     Called from outside
     Could probably have remained unchanged from the HTTP class, but this is shorter, and caching is irrelevant for text files (disk cache)

     Returns the page text, or "" for an empty title. $do_cache is ignored.
     */
    function get_wiki_text ( $title , $do_cache = false ) {
        $title = trim ( $title ) ;
        if ( $title == "" ) return "" ; # Just in case...
        if ( $this->first_title == "" ) {
            $this->first_title = $title ;
        }
        $text = $this->get_page_text ( $title ) ;
        return $text ;
    }

    /**
     * Location object for a page; its ->fullname is the file path without the ending.
     */
    function get_file_location ( $ns , $title ) {
        return get_file_location_global ( $this->basedir , $ns , $title , false ) ;
    }

    /**
     * Loads the (trimmed) text of a page from namespace 0.
     *
     * @param $page           Page title
     * @param $allow_redirect If true, one "#REDIRECT" is followed; the
     *                        target page is then read with redirects off
     * @return The page text, or "" if the file does not exist
     */
    function get_page_text ( $page , $allow_redirect = true ) {
        $filename = $this->get_file_location ( 0 , $page ) ;
        $filename = $filename->fullname . $this->file_ending ;
        if ( !file_exists ( $filename ) ) return "" ;
        $text = trim ( file_get_contents ( $filename ) ) ;

        # REDIRECT?
        if ( $allow_redirect && strtoupper ( substr ( $text , 0 , 9 ) ) == "#REDIRECT" ) {
            # The target is the rest of the first line, without the link brackets
            $text = substr ( $text , 9 ) ;
            $lines = explode ( "\n" , $text , 2 ) ; # array_shift() needs a real variable (by-reference argument)
            $text = array_shift ( $lines ) ;
            $text = str_replace ( "[[" , "" , $text ) ;
            $text = str_replace ( "]]" , "" , $text ) ;
            $text = ucfirst ( trim ( $text ) ) ;
            $text = $this->get_page_text ( $text , false ) ;
        }
        return $text ;
    }

    /**
     * Links to browse_texts.php if the target page exists as a file,
     * otherwise returns the plain link text.
     */
    function get_internal_link ( $target , $text ) {
        $file = $this->get_file_location ( 0 , $target ) ;
        if ( !file_exists ( $file->fullname.$this->file_ending ) ) return $text ;
        else return "<a href='browse_texts.php?title=" . urlencode ( $target ) . "'>{$text}</a>" ;
    }

    /**
     * Images are not shown for this provider.
     */
    function do_show_images () {
        return false ;
    }

}
319
/**
 * Access through MySQL interface
 * (Used via the extension via Special::wiki2XML)
 *
 * Page text is obtained through the Title and Article classes, which are
 * not defined in this file.
 */
class ContentProviderMySQL extends ContentProviderHTTP {
    # NOTE(review): get_file_location() and get_internal_link() below read these two
    # properties, but neither this class nor its parents declared them. They are
    # declared here with the values the code effectively saw before (null / empty
    # string), so behavior is unchanged. The file-based link check looks copied
    # from ContentProviderTextFile - confirm whether it is meant to work here.
    var $basedir ;
    var $file_ending = "" ;

    /**
     * Reads the page through get_page_text() instead of downloading it.
     */
    function do_get_contents ( $title ) {
        return $this->get_page_text ( $title ) ;
    }

    /**
     Called from outside

     Returns the page text, or "" for an empty title. $do_cache is ignored.
     */
    function get_wiki_text ( $title , $do_cache = false ) {
        $title = trim ( $title ) ;
        if ( $title == "" ) return "" ; # Just in case...
        if ( $this->first_title == "" ) {
            $this->first_title = $title ;
        }
        $text = $this->get_page_text ( $title ) ;
        return $text ;
    }

    /**
     * Location object for a page, as computed by get_file_location_global().
     */
    function get_file_location ( $ns , $title ) {
        return get_file_location_global ( $this->basedir , $ns , $title , false ) ;
    }

    /**
     * Loads the text of a page via Title::newFromText() and Article.
     *
     * @param $page           Page title
     * @param $allow_redirect If true, one "#REDIRECT" is followed; the
     *                        target page is then read with redirects off
     * @return The page text, or "" if the title is invalid or the article does not exist
     */
    function get_page_text ( $page , $allow_redirect = true ) {
        $title = Title::newFromText ( $page ) ;
        # No usable Title object (MediaWiki documents null for invalid titles): treat as a missing page
        if ( !$title ) {
            return "";
        }
        $article = new Article ( $title ) ;

        # article does not exist?
        if (!$article->exists()) {
            return "";
        }
        $text = $article->getContent () ;

        # REDIRECT?
        if ( $allow_redirect && strtoupper ( substr ( $text , 0 , 9 ) ) == "#REDIRECT" ) {
            # The target is the rest of the first line, without the link brackets
            $text = substr ( $text , 9 ) ;
            $lines = explode ( "\n" , $text , 2 ) ; # array_shift() needs a real variable (by-reference argument)
            $text = array_shift ( $lines ) ;
            $text = str_replace ( "[[" , "" , $text ) ;
            $text = str_replace ( "]]" , "" , $text ) ;
            $text = ucfirst ( trim ( $text ) ) ;
            $text = $this->get_page_text ( $text , false ) ;
        }
        return $text ;
    }

    /**
     * Links to browse_texts.php if a file for the target exists at the
     * computed file location, otherwise returns the plain link text.
     */
    function get_internal_link ( $target , $text ) {
        $file = $this->get_file_location ( 0 , $target ) ;
        if ( !file_exists ( $file->fullname.$this->file_ending ) ) return $text ;
        else return "<a href='browse_texts.php?title=" . urlencode ( $target ) . "'>{$text}</a>" ;
    }

    /**
     * Images are not shown for this provider.
     */
    function do_show_images () {
        return false ;
    }

}
378
379?>
Note: See TracBrowser for help on using the repository browser.