source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wiki2xml/php/w2x.php

Last change on this file was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 17.5 KB
Line 
1<?php
2# Copyright by Magnus Manske (2005 - 2006)
3# Released under GPL
4
# Script bootstrap: lift the execution time limit, cap memory, set the
# outgoing user agent, then load configuration and the converter classes.

# No time limit (the @ silences any warning raised by the call)
@set_time_limit(0);

# 64MB upper limit
ini_set('memory_limit', 64 * 1024 * 1024);

# Fake user agent; the literal (backslash included) is kept exactly as written
ini_set('user_agent', 'MSIE 4\.0b2;');

# Stand-alone mode: MEDIAWIKI is not defined, so load the default settings
if (!defined('MEDIAWIKI')) {
    include_once 'default.php'; # Which will include local.php, if available
}

require_once 'mediawiki_converter.php';
14
15## TIMER FUNCTION
16
# Returns the current Unix timestamp as a float, with the microseconds
# in the fractional part. Used to time the conversion run.
function microtime_float()
{
    # microtime(true) yields the float directly, so there is no need to
    # split and re-add the "usec sec" string form.
    return microtime(true);
}
22
# Builds the HTML input form and returns it as a string.
#
# $as_extension: when true the site is emitted as a hidden field, otherwise
#                as an editable text field.
#
# Reads request parameters through get_param() to pre-select radio buttons and
# checkboxes, and reads/writes $xmlg["site_base_url"]. Values that originate
# from the request (pasted text, site) are HTML-escaped before being embedded,
# so markup or quotes in them cannot break out of the textarea / attribute.
function get_form ( $as_extension = false ) {
    global $xmlg ;

    # ENT_SUBSTITUTE only exists from PHP 5.4 on; use it when available so
    # invalid byte sequences do not blank the whole value.
    $esc_flags = ENT_QUOTES | ( defined ( 'ENT_SUBSTITUTE' ) ? ENT_SUBSTITUTE : 0 ) ;

    # Pre-select the "This is ..." radio button
    $_wt = get_param ( 'whatsthis' , 'articlelist' ) ;
    $wt['wikitext'] = $wt['articlelist'] = $wt['listpagename'] = "" ;
    $wt[$_wt] = 'checked' ;

    # Pre-select the output format radio button
    $xmlg["site_base_url"] = get_param('site',$xmlg["site_base_url"]) ;
    $_out = get_param ( 'output_format' , 'xml' ) ;
    $out['xml'] = $out['text'] = $out['translated_text'] = $out['xhtml'] = $out['docbook_xml'] = $out['docbook_pdf'] = $out['docbook_html'] =
        $out['odt_xml'] = $out['odt'] = '' ;
    $out[$_out] = 'checked' ;
    # "useapi" may not have been set yet; treat a missing value as unchecked
    $useapi_checked = !empty ( $xmlg["useapi"] ) ? 'checked' : '' ;

    # Output formats that are only offered when the matching tool is configured
    $optional = array () ;
    if ( isset ( $xmlg['docbook']['command_pdf'] ) ) {
        $optional[] = "<INPUT {$out['docbook_pdf']} type='radio' name='output_format' value='docbook_pdf'>DocBook PDF" ;
    }
    if ( isset ( $xmlg['docbook']['command_html'] ) ) {
        $optional[] = "<INPUT {$out['docbook_html']} type='radio' name='output_format' value='docbook_html'>DocBook HTML" ;
    }
    if ( isset ( $xmlg['zip_odt'] ) ) {
        $optional[] = "<INPUT {$out['odt_xml']} type='radio' name='output_format' value='odt_xml'>OpenOffice XML" ;
        $optional[] = "<INPUT {$out['odt']} type='radio' name='output_format' value='odt'>OpenOffice ODT" .
            "<input type='checkbox' name='odt_footnote' value='1' ".(get_param('odt_footnote',false)?'checked':'').">References as endnotes (instead of footnotes)" ;
    }
    $optional = "<br/>" . implode ( "<br/>" , $optional ) ;

    # Escape request-derived values before they go into the markup.
    # Non-scalar input (e.g. site[]=x) is rendered as an empty value.
    $site_value = $xmlg["site_base_url"] ;
    $site_attr = is_scalar ( $site_value ) ? htmlspecialchars ( (string) $site_value , $esc_flags , 'UTF-8' ) : '' ;
    $text_value = get_param ( 'text' , '' ) ;
    $text_html = is_scalar ( $text_value ) ? htmlspecialchars ( (string) $text_value , $esc_flags , 'UTF-8' ) : '' ;

    # dub sez... working images
    if ( $as_extension ) $site = "<input type='hidden' name='site' value='".$site_attr."'/>" ;
    else $site = "Site : http://<input type='text' name='site' value='".$site_attr."'/>/index.php<br/>" ;

    $additional = array() ;
    if ( $xmlg['allow_get'] ) {
        $additional[] = "This page can be called with parameters: w2x.php?doit=1&whatsthis=articlelist&site=en.wikipedia.org/w&output_format=odt&text=Biochemistry" ;
        $additional[] = "For additional parameters, see <a href='README'>here</a>" ;
    }

    # Plain text translation options, sorted by language name
    $a = array (
        'en' => 'English',
        'de' => 'German',
        'fr' => 'French',
        'es' => 'Spanish',
        'it' => 'Italian',
    ) ;
    asort ( $a ) ;
    $tttlo = "" ;
    foreach ( $a AS $b => $c ) {
        $tttlo .= "<option value='{$b}'>{$c}</option>" ;
    }

    $additional = "<div style='text-align:center; border-top:1px solid black;width:100%;font-size:12px'>" .
        implode ( "<br/>" , $additional ) .
        "</div>" ;

return "<form method='post'>
<h2>Paste article list or wikitext here</h2>
<table border='0' width='100%'><tr>
<td valign='top'><textarea rows='20' cols='80' style='width:100%' name='text'>" .
$text_html .
"</textarea></td>
<td width='200px' valign='top' nowrap>
<INPUT checked type='radio' name='use_templates' value='all'>Use all templates<br/>
<INPUT type='radio' name='use_templates' value='none'>Do not use templates<br/>
<INPUT type='radio' name='use_templates' value='these'>Use these templates<br/>
<INPUT type='radio' name='use_templates' value='notthese'>Use all but these templates<br/>
<textarea rows='15' cols='30' style='width:100%' name='templates'></textarea>
</td></tr></table>
<table border='0'><tr>
<td valign='top'>
This is<br/>
<INPUT {$wt['wikitext']} type='radio' name='whatsthis' value='wikitext'>raw wikitext <br/>
<INPUT {$wt['articlelist']} type='radio' name='whatsthis' value='articlelist'>a list of articles<br/>
<INPUT {$wt['listpagename']} type='radio' name='whatsthis' value='listpagename'>the name of an article with a list of pages<br/>

{$site}
<input type='checkbox' name='useapi' value='1' id='useapi' $useapi_checked /><label for='useapi'>Use API</label><br/>
Title : <input type='text' name='document_title' value='' size=40/><br/>
<input type='checkbox' name='add_gfdl' value='1' checked>Include GFDL (for some output formats)</input><br/>
<input type='checkbox' name='keep_categories' value='1' checked>Keep categories</input><br/>
<input type='checkbox' name='keep_interlanguage' value='1' checked>Keep interlanguage links</input><br/>
<input type='submit' name='doit' value='Convert'/>
</td><td valign='top' style='border-left:1px black solid'>
<b>Output</b>
<br/><INPUT {$out['xml']} type='radio' name='output_format' value='xml'>XML
<br/><INPUT {$out['text']} type='radio' name='output_format' value='text'>Plain text
 <input type='checkbox' name='plaintext_markup' value='1' ".(get_param('plaintext_markup',true)?'checked':'').">Use *_/ markup</input>
 <input type='checkbox' name='plaintext_prelink' value='1' ".(get_param('plaintext_prelink',true)?'checked':'').">Put &rarr; before internal links</input>
<br/><INPUT {$out['translated_text']} type='radio' name='output_format' value='translated_text'>Plain text, google-translated to
 <select name='translated_text_target_language'>{$tttlo}</select> (works only for wikipedia/wikibooks)
<br/><INPUT {$out['xhtml']} type='radio' name='output_format' value='xhtml'>XHTML
 <input type='checkbox' name='xhtml_justify' value='1' ".(get_param('xhtml_justify',true)?'checked':'').">Align paragraphs as 'justify'</input>
 <input type='checkbox' name='xhtml_logical_markup' value='1' ".(get_param('xhtml_logical_markup',true)?'checked':'').">Use logical markup (e.g., 'strong' instead of 'b')</input>
 <input type='checkbox' name='xhtml_source' value='1' ".(get_param('xhtml_source',false)?'checked':'').">Return source XHTML</input>
<br/><INPUT {$out['docbook_xml']} type='radio' name='output_format' value='docbook_xml'>DocBook XML
{$optional}
</tr></table>
</form>
<p>
Known issues:
<ul>
<li>In templates, {{{variables}}} used within &lt;nowiki&gt; tags will be replaced as well (too lazy to strip them)</li>
<li>HTML comments are removed (instead of converted into XML tags)</li>
</ul>{$additional}
</p>" ;
}
130
# Fetches the request parameter named $s, or $default when it is not set.
# When $xmlg['allow_get'] is truthy the value is looked up in $_REQUEST,
# otherwise only $_POST is consulted.
function get_param ( $s , $default = NULL ) {
    global $xmlg ;

    $source = $xmlg['allow_get'] ? $_REQUEST : $_POST ;

    return isset ( $source[$s] ) ? $source[$s] : $default ;
}
147
# Queues one article on the list of to-be-converted articles.
# The name is trimmed, blank names are dropped, and every underscore
# is turned into a space before the name is appended to $aArticles.
function push_article ( &$aArticles, $article ) {
    $title = trim ( $article ) ;
    if ( $title == "" ) {
        return ;
    }

    # convert _ to ' '
    $aArticles[] = str_replace ( '_' , ' ' , $title ) ;
}
158
# Appends one article's XML to $xml.
#
# Without temp files ($xmlg["use_xml_temp_files"] falsy) $new_xml is simply
# concatenated onto the $xml string. With temp files, $xml becomes an array of
# file names: $new_xml is written to a fresh file in $xmlg["temp_dir"] and the
# file name is appended.
#
# Throws RuntimeException if no temporary file could be opened after a bounded
# number of attempts (previously this case looped forever).
function append_to_xml ( &$xml , $new_xml ) {
    global $xmlg ;

    if ( !$xmlg["use_xml_temp_files"] ) { # Do not use temp files
        $xml .= $new_xml ;
        return ;
    }

    # Use temp files
    if ( !is_array ( $xml ) ) $xml = array () ;

    $max_attempts = 10 ;
    $tmp_file = false ;
    $tmp_file_name = false ;
    for ( $attempt = 0 ; $attempt < $max_attempts ; $attempt++ ) {
        $tmp_file_name = tempnam ( $xmlg["temp_dir"] , "XMLTMP" ) ;
        if ( $tmp_file_name === false ) continue ;

        $tmp_file = fopen ( $tmp_file_name , 'wb' ) ;
        if ( $tmp_file !== false ) break ;

        # tempnam() already created the file; do not leave it behind
        @unlink ( $tmp_file_name ) ;
    }
    if ( $tmp_file === false ) {
        throw new RuntimeException ( "Could not create a temporary XML file after {$max_attempts} attempts" ) ;
    }

    fwrite ( $tmp_file , $new_xml ) ;
    fclose ( $tmp_file ) ;

    $xml[] = $tmp_file_name ;
}
177
# Returns the next chunk of article XML, or false when nothing is left.
#
# If $xml is an array it is treated as a queue of temp file names: the first
# file is read, deleted, and its contents returned. Otherwise $xml is the whole
# XML string: it is returned once and $xml is replaced by an empty array, so
# the following call reports false.
function xml_shift ( &$xml ) {
    if ( is_array ( $xml ) ) { # Use temp files
        if ( count ( $xml ) == 0 ) return false ;

        $file_name = array_shift ( $xml ) ;
        $contents = file_get_contents ( $file_name ) ;
        unlink ( $file_name ) ;
        return $contents ;
    }

    # Do not use temp files
    if ( $xml == '' ) return false ;

    $everything = $xml ;
    $xml = array () ;
    return $everything ;
}
195
# Deletes any temporary XML files still listed in $xml and empties the list.
# Should not be necessary if xml_shift was used to drain the list.
#
# Returns nothing (null) when temp files are disabled or after a cleanup,
# and false when temp files are enabled but $xml is not an array.
function xml_cleanup ( &$xml ) {
    global $xmlg ;

    if ( $xmlg["use_xml_temp_files"] ) {
        if ( !is_array ( $xml ) ) {
            return false ;
        }

        foreach ( $xml AS $file_name ) {
            unlink ( $file_name ) ;
        }
        $xml = array () ;
    }
    # not using temp files: nothing to do
}
207
208
209## MAIN PROGRAM
210
# Entry point. With the "doit" parameter set, the submitted wikitext or article
# list is converted and streamed back in the requested output format, after
# which the script exits. Without it, the input form is shown (stand-alone) or
# stored in $out (MediaWiki extension).
if ( get_param('doit',false) ) { # Process
    $wikitext = get_param('text','') ;
    # Only undo magic-quotes escaping where it can actually be active.
    # magic_quotes_gpc was removed in PHP 5.4; stripping unconditionally would
    # eat genuine backslashes (e.g. in <math> markup).
    if ( version_compare ( PHP_VERSION , '5.4.0' , '<' ) && get_magic_quotes_gpc() ) {
        $wikitext = stripslashes ( $wikitext ) ;
    }

    if( !defined( 'MEDIAWIKI' ) ) { # Stand-alone
        $content_provider = new ContentProviderHTTP ;
    } else { # MediaWiki extension
        $content_provider = new ContentProviderMySQL ;
    }
    $converter = new MediaWikiConverter ;

    $xmlg["useapi"] = isset ( $_REQUEST['useapi'] ) ;
    $xmlg["book_title"] = get_param('document_title');
    $xmlg["site_base_url"] = get_param('site') ;
    $xmlg["resolvetemplates"] = get_param('use_templates','all') ;
    $xmlg['templates'] = explode ( "\n" , get_param('templates','') ) ;
    $xmlg['add_gfdl'] = get_param('add_gfdl',false) ;
    $xmlg['keep_interlanguage'] = get_param('keep_interlanguage',false) ;
    $xmlg['keep_categories'] = get_param('keep_categories',false) ;

    # the article list
    $aArticles = array () ;

    $t = microtime_float() ;
    $xml = "" ;

    $format = get_param('output_format') ;
    $whatsthis = get_param('whatsthis') ;

    # Catch listpagename: the first input line names a page whose list items
    # (lines starting with *, # or :) are turned into the article list
    if ( $whatsthis == "listpagename" ) {
        # array_shift() takes its argument by reference, so it needs a variable
        $input_lines = explode ( "\n" , $wikitext ) ;
        $listpage = trim ( array_shift ( $input_lines ) ) ;
        $wikitext = $content_provider->get_wiki_text ( $listpage ) ;
        $lines = explode ( "\n" , $wikitext ) ;
        $wikitext = array () ;
        foreach ( $lines AS $l ) {
            $l1 = substr ( $l , 0 , 1 ) ;
            if ( $l1 != '*' && $l1 != '#' && $l1 != ':' ) continue ;
            # Keep what sits between the first "[[" and the following "]]"
            $l = explode ( '[[' , $l , 2 ) ;
            $link_parts = explode ( ']]' , array_pop ( $l ) , 2 ) ;
            $l = trim ( array_shift ( $link_parts ) ) ;
            if ( $l == '' ) continue ;
            $wikitext[] = $l ;
        }
        $wikitext = implode ( "\n" , $wikitext ) ;
        $whatsthis = 'articlelist' ;
    }

    # QUICK HACK! NEEDS TO WORK!
    if ( $format == "odt" || $format == "odt_xml" || $format == "docbook_pdf" || $format == "docbook_html" || $format == "docbook_xml" ) {
        $xmlg["allow_xml_temp_files"] = false ;
    }

    if ( $whatsthis == "wikitext" ) {
        $content_provider->first_title = "Raw wikitext page" ;
        $wiki2xml_authors = array () ;
        $xml = $converter->article2xml ( "" , $wikitext , $xmlg ) ;
    } else {
        # The setting may be missing from the configuration; treat that as "off"
        if ( !empty ( $xmlg['allow_xml_temp_files'] ) ) $xmlg['use_xml_temp_files'] = true ;

        foreach ( explode ( "\n" , $wikitext ) AS $a ) {
            push_article( $aArticles, $a );
        }

        # set the first article name as the default title (if there is one)
        if ( $xmlg["book_title"] == '' && isset ( $aArticles[0] ) ) {
            $xmlg["book_title"] = $aArticles[0];
        }
        # as long as we have articles to convert (this might change in between!)
        # Looping on count() rather than on the shifted value keeps an article
        # literally named "0" from ending the loop early.
        while ( count ( $aArticles ) > 0 ) {
            $a = array_shift ( $aArticles ) ;
            # NOTE(review): article2xml receives $aArticles and may add entries;
            # blank ones are skipped here instead of aborting the loop - confirm
            if ( trim ( $a ) == '' ) continue ;
            $wiki2xml_authors = array () ;

            # Article page|Article name
            $a = explode ( '|' , $a ) ;
            if ( count ( $a ) == 1 ) $a[] = $a[0] ;
            $title_page = trim ( array_shift ( $a ) ) ;
            $title_name = trim ( array_pop ( $a ) ) ;

            $wikitext = $content_provider->get_wiki_text ( $title_page ) ;
            add_authors ( $content_provider->authors ) ;
            append_to_xml ( $xml , $converter->article2xml ( $title_name , $wikitext , $xmlg, $aArticles ) ) ;
        }
    }

    # Timing: total, load (as reported by the content provider), and the rest
    $t = microtime_float() - $t ;
    $tt = round( $t, 3 ) ;
    $lt = round( $content_provider->load_time, 3 ) ;
    $t = round( $t - $lt, 3) ;

    $xmlg['xml_articles_header'] = "<articles xmlns:xhtml=\" \" loadtime='{$lt} sec' rendertime='{$t} sec' totaltime='{$tt} sec'>" ;

    # Output format
    if ( $format == "xml" ) {
        header('Content-type: text/xml; charset=utf-8');
        print "<?xml version='1.0' encoding='UTF-8' ?>\n" ;
        print xml_articles_header() ;
        # xml_shift() returns false when exhausted; compare strictly so an
        # empty or "0" chunk does not cut the output short
        while ( ( $x = xml_shift ( $xml ) ) !== false ) print $x ;
        print "</articles>" ;
    } else if ( $format == "text" ) {
        $xmlg['plaintext_markup'] = get_param('plaintext_markup',false) ;
        $xmlg['plaintext_prelink'] = get_param('plaintext_prelink',false) ;
        $out = $converter->articles2text ( $xml , $xmlg ) ;
        $out = str_replace ( "\n" , "<br/>" , $out ) ;
        header('Content-type: text/html; charset=utf-8');
        print $out ;

    } else if ( $format == "translated_text" ) {
        $xmlg['plaintext_markup'] = false ;
        $xmlg['plaintext_prelink'] = false ;
        $out = $converter->articles2text ( $xml , $xmlg ) ;
        # Drop the first line of the plain text before translating
        $out = explode ( "\n" , $out ) ;
        array_shift ( $out ) ;
        $out = trim ( implode ( "\n" , $out ) ) ;
        # Source language is the first dot-separated part of the site host
        $site_parts = explode ( '.' , $xmlg["site_base_url"] ) ;
        $source_language = array_shift ( $site_parts ) ;
        $target_language = get_param ( 'translated_text_target_language' , 'en' ) ;
        $langpair = urlencode ( "{$source_language}|{$target_language}" ) ;
        $url = "http://www.google.com/translate_t?langpair={$langpair}&text=" . urlencode ( utf8_decode ( $out ) ) ;
        echo file_get_contents ( $url ) ;

    } else if ( $format == "xhtml" ) {
        $xmlg['xhtml_justify'] = get_param ( 'xhtml_justify' , false ) ;
        $xmlg['xhtml_logical_markup'] = get_param ( 'xhtml_logical_markup' , false ) ;
        $xmlg['xhtml_source'] = get_param ( 'xhtml_source' , false ) ;

        if ( $xmlg['xhtml_source'] ) {
            header('Content-type: text/xml; charset=utf-8');
            # Put every tag on its own line
            $s = $converter->articles2xhtml ( $xml , $xmlg ) ;
            $s = str_replace ( '>' , ">\n" , $s ) ;
            $s = str_replace ( '<' , "\n<" , $s ) ;
            $s = str_replace ( "\n\n" , "\n" , $s ) ;
            echo trim ( $s ) ;
        } else {
            # Header hack for IE; the Accept header may be absent altogether
            $accept = isset ( $_SERVER["HTTP_ACCEPT"] ) ? $_SERVER["HTTP_ACCEPT"] : '' ;
            if ( stristr($accept,"application/xhtml+xml") ) {
                header("Content-type: application/xhtml+xml");
            } else {
                header("Content-type: text/html");
            }
            echo $converter->articles2xhtml ( $xml , $xmlg ) ;
        }

    } else if ( $format == "odt" || $format == "odt_xml" ) {
        if ( isset ( $_REQUEST['odt_footnote'] ) ) $xmlg["odt_footnote"] = 'endnote' ;
        if ( $xmlg['sourcedir'] == '.' ) $cwd = getcwd() ;
        else $cwd = $xmlg['sourcedir'] ;
        $template_file = $cwd . '/template.odt' ;

        $dir_file = tempnam($xmlg["temp_dir"], "ODD");
        $dir = $dir_file . "-DIR" ;
        $xmlg['image_destination'] = $dir . "/Pictures" ;

        $zipdir = $cwd ;
        if ( isset ( $xmlg["zip_odt_path"] ) ) # Windows strange bug workaround
            $zipdir = $xmlg["zip_odt_path"] ;

        chdir ( $zipdir ) ;

        # Unzip template
        $cmd = $xmlg['unzip_odt'] ;
        $cmd = str_replace ( '$1' , escapeshellarg ( $template_file ) , $cmd ) ;
        $cmd = str_replace ( '$2' , escapeshellarg ( $dir ) , $cmd ) ;
        exec ( $cmd ) ;

        # Convert XML to ODT
        chdir ( $cwd ) ;
        if ( $format == "odt_xml" ) $content_provider->block_file_download = true ;
        $out = $converter->articles2odt ( $xml , $xmlg ) ;
        chdir ( $zipdir ) ;

        # Create ODT structure
        $out_file = false ;
        $handle = fopen ( $dir . "/content.xml" , "w" ) ;
        if ($handle) {
            fwrite ( $handle , $out ) ;
            fclose ( $handle ) ;
            # Generate temporary ODT file
            $out_file = tempnam('', "ODT");
            $cmd = $xmlg['zip_odt'] ;
            $cmd = str_replace ( '$1' , escapeshellarg ( $out_file ) , $cmd ) ;

            if ( $xmlg['is_windows'] ) {
                $cmd = str_replace ( '$2' , escapeshellarg ( $dir . "/" ) , $cmd ) ;
            } else {
                $cmd = str_replace ( '$2' , escapeshellarg ( './' ) , $cmd ) ;
                # linux/unix zip needs to be in the directory, otherwise it will
                # include needless parts into the directory structure
                chdir ($dir);
            }

            # remove the output if it for some reason already exists
            @unlink ( $out_file ) ;
            exec ( $cmd ) ;

            if ( $format == "odt" ) { # Return ODT file
                $filename = $xmlg["book_title"] ;
                if (!preg_match('/\.[a-zA-Z]{3}$/',$filename)) { $filename .= '.odt'; }
                if (!preg_match('/\.[a-zA-Z]{3}$/',$out_file)) { $out_file .= '.zip'; }
                # Open first, so headers are only sent when there is a file to send
                $handle = fopen($out_file, 'rb');
                if ( $handle ) {
                    # The title is user input: keep quotes and line breaks out of the header
                    $safe_filename = str_replace ( array ( '"' , "\r" , "\n" ) , '' , $filename ) ;
                    header('Content-type: application/vnd.oasis.opendocument.text; charset=utf-8');
                    header('Content-Disposition: inline; filename="'.$safe_filename.'"');
                    fpassthru ( $handle ) ;
                    fclose ( $handle ) ;
                }
            } else { # Return XML
                header('Content-type: text/xml; charset=utf-8');
                print str_replace ( ">" , ">\n" , $out ) ;
            }
        } # error occured

        # Cleanup, also when content.xml could not be written
        if ( is_dir ( $dir ) ) {
            SureRemoveDir ( $dir ) ;
            @rmdir ( $dir ) ;
        }
        @unlink ( $dir_file ) ;
        if ( $out_file !== false ) @unlink ( $out_file ) ;
        chdir ( $cwd ) ;

    } else if ( $format == "docbook_xml" ) {
        $out = $converter->articles2docbook_xml ( $xml , $xmlg ) ;
        header('Content-type: text/xml; charset=utf-8');
        print $out ;
    } else if ( $format == "docbook_pdf" || $format == "docbook_html" ) {
        # "pdf" or "html": the part of the format name after "docbook_"
        $filetype = substr ( $format , 8 ) ;
        $filename = $converter->articles2docbook_pdf ( $xml , $xmlg , strtoupper ( $filetype ) ) ;

        if ( file_exists ( $filename ) ) {
            $fp = fopen($filename, 'rb');
            if ( $fp !== false ) {
                # The title is user input: keep quotes and line breaks out of the header
                $safe_title = str_replace ( array ( '"' , "\r" , "\n" ) , '' , $xmlg["book_title"] ) ;
                if ( $format == "docbook_pdf" ) {
                    header('Content-Type: application/pdf');
                    header("Content-Length: " . (string) filesize($filename));
                    header('Content-Disposition: attachment; filename="'.$safe_title.'.pdf"');
                } else if ( $format == "docbook_html" ) {
                    header('Content-Type: text/html');
                    header("Content-Length: " . (string) filesize($filename));
                    header('Content-Disposition: inline; filename="'.$safe_title.'.html"');
                }
                fpassthru($fp);
                fclose ( $fp ) ;
            }
        }

        # Cleanup
        # NOTE(review): removes the directory two levels above the generated
        # file; assumes articles2docbook_pdf always returns a path inside its
        # own temp tree - confirm
        $pdf_dir = dirname ( dirname ( $filename ) ) ;
        SureRemoveDir ( $pdf_dir ) ;
        @rmdir ( $pdf_dir ) ;
    }
    xml_cleanup ( $xml ) ;
    exit ;
} else { # Show the form
    if( !defined( 'MEDIAWIKI' ) ) { # Stand-alone
        header('Content-type: text/html; charset=utf-8');
        print "
<html><head></head><body>
<h1>Magnus' magic MediaWiki-to-XML-to-stuff converter</h1>
<p>All written in PHP - so portable, <s>so incredibly slow...</s> <i>about as fast as the original MediaWiki parser!</i> <small>(For the source, see <a href='http://www.mediawiki.org/wiki/Subversion'>here</a>, trunk 'wiki2xml', directory 'php')</small></p>" ;
        $xmlg["useapi"] = 1 ;
        print get_form () ;
        print "</body></html>" ;
    } else { # MediaWiki extension
        # The form is left in $out; presumably the including extension prints it - TODO confirm
        $out = get_form ( true ) ;
    }

}
474
475#<input type='checkbox' name='resolvetemplates' value='1' checked>Automatically resolve templates</input><br/>
476
477?>
Note: See TracBrowser for help on using the repository browser.