Context Navigation

w2x.php

Last change on this file was 25141, checked in by papitha, 12 years ago
NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!
File size: 17.5 KB

Line
1	<?php
2	# Copyright by Magnus Manske (2005 - 2006)
3	# Released under GPL
4
# Runtime configuration for the converter script.
@set_time_limit ( 0 ) ; # No time limit
# 64MB expressed in bytes; the product form keeps the value in step with the comment.
ini_set ( 'memory_limit' , 1024 * 1024 * 64 ) ; # 64MB upper limit
ini_set ( 'user_agent' , 'MSIE 4\.0b2;' ) ; # Fake user agent

if ( !defined ( 'MEDIAWIKI' ) ) { # Stand-alone
	include_once ( "default.php" ) ; # Which will include local.php, if available
}

require_once ( "mediawiki_converter.php" ) ;
14
15	## TIMER FUNCTION
16
/**
 * Returns the current Unix timestamp with microsecond resolution.
 *
 * @return float Seconds since the Unix epoch, including the fractional part.
 */
function microtime_float()
{
	# microtime(true) already yields "seconds + microseconds" as one float,
	# so there is no need to split and re-add the string form.
	return microtime ( true ) ;
}
22
/**
 * Builds the HTML input form of the converter.
 *
 * Reads the current request values through get_param() so that a re-displayed
 * form keeps the chosen input type, output format and pasted text. As a side
 * effect, $xmlg["site_base_url"] is overwritten with the 'site' request
 * parameter when one is present.
 *
 * Request-supplied values (the pasted text and the site URL) are HTML-escaped
 * before being placed into the markup.
 *
 * @param bool $as_extension When true, the site is emitted as a hidden field
 *                           instead of an editable text field.
 * @return string The complete form markup, followed by the "known issues" notes.
 */
function get_form ( $as_extension = false ) {
	global $xmlg ;

	# Which radio button of the "This is ..." group is pre-selected
	$_wt = get_param ( 'whatsthis' , 'articlelist' ) ;
	$wt['wikitext'] = $wt['articlelist'] = $wt['listpagename'] = "" ;
	$wt[$_wt] = 'checked' ;

	$xmlg["site_base_url"] = get_param('site',$xmlg["site_base_url"]) ;

	# Which radio button of the output format group is pre-selected
	$_out = get_param ( 'output_format' , 'xml' ) ;
	$out['xml'] = $out['text'] = $out['translated_text'] = $out['xhtml'] = $out['docbook_xml'] = $out['docbook_pdf'] = $out['docbook_html'] =
		$out['odt_xml'] = $out['odt'] = '' ;
	$out[$_out] = 'checked' ;
	$useapi_checked = $xmlg["useapi"] ? 'checked' : '' ;

	# Output formats that are only offered when the matching tool is configured
	$optional = array () ;
	if ( isset ( $xmlg['docbook']['command_pdf'] ) ) {
		$optional[] = "<INPUT {$out['docbook_pdf']} type='radio' name='output_format' value='docbook_pdf'>DocBook PDF" ;
	}
	if ( isset ( $xmlg['docbook']['command_html'] ) ) {
		$optional[] = "<INPUT {$out['docbook_html']} type='radio' name='output_format' value='docbook_html'>DocBook HTML" ;
	}
	if ( isset ( $xmlg['zip_odt'] ) ) {
		$optional[] = "<INPUT {$out['odt_xml']} type='radio' name='output_format' value='odt_xml'>OpenOffice XML" ;
		$optional[] = "<INPUT {$out['odt']} type='radio' name='output_format' value='odt'>OpenOffice ODT" .
			"<input type='checkbox' name='odt_footnote' value='1' ".(get_param('odt_footnote',false)?'checked':'').">References as endnotes (instead of footnotes)" ;
	}
	$optional = "<br/>" . implode ( "<br/>" , $optional ) ;

	# The site value may come straight from the request; escape it for use
	# inside a single-quoted HTML attribute. Array-valued input is dropped.
	$site_raw = is_array ( $xmlg["site_base_url"] ) ? '' : (string) $xmlg["site_base_url"] ;
	$site_attr = htmlspecialchars ( $site_raw , ENT_QUOTES , 'UTF-8' ) ;

	# dub sez... working images
	if ( $as_extension ) $site = "<input type='hidden' name='site' value='".$site_attr."'/>" ;
	else $site = "Site : http://<input type='text' name='site' value='".$site_attr."'/>/index.php<br/>" ;

	$additional = array() ;
	if ( $xmlg['allow_get'] ) {
		$additional[] = "This page can be called with parameters: w2x.php?doit=1&whatsthis=articlelist&site=en.wikipedia.org/w&output_format=odt&text=Biochemistry" ;
		$additional[] = "For additional parameters, see <a href='README'>here</a>" ;
	}

	# Plain text translation options
	$a = array (
		'en' => 'English',
		'de' => 'German',
		'fr' => 'French',
		'es' => 'Spanish',
		'it' => 'Italian',
	) ;
	asort ( $a ) ;
	$tttlo = "" ;
	foreach ( $a AS $b => $c ) {
		$tttlo .= "<option value='{$b}'>{$c}</option>" ;
	}

	$additional = "<div style='text-align:center; border-top:1px solid black;width:100%;font-size:12px'>" .
		implode ( "<br/>" , $additional ) .
		"</div>" ;

	# The pasted text is echoed back into the textarea; escape it so that
	# markup in the request cannot break out of the element.
	$text_raw = get_param ( 'text' , '' ) ;
	if ( is_array ( $text_raw ) ) $text_raw = '' ;
	$text_html = htmlspecialchars ( (string) $text_raw , ENT_QUOTES , 'UTF-8' ) ;

	return "<form method='post'>
<h2>Paste article list or wikitext here</h2>
<table border='0' width='100%'><tr>
<td valign='top'><textarea rows='20' cols='80' style='width:100%' name='text'>" .
	$text_html .
	"</textarea></td>
<td width='200px' valign='top' nowrap>
<INPUT checked type='radio' name='use_templates' value='all'>Use all templates<br/>
<INPUT type='radio' name='use_templates' value='none'>Do not use templates<br/>
<INPUT type='radio' name='use_templates' value='these'>Use these templates<br/>
<INPUT type='radio' name='use_templates' value='notthese'>Use all but these templates<br/>
<textarea rows='15' cols='30' style='width:100%' name='templates'></textarea>
</td></tr></table>
<table border='0'><tr>
<td valign='top'>
This is<br/>
<INPUT {$wt['wikitext']} type='radio' name='whatsthis' value='wikitext'>raw wikitext <br/>
<INPUT {$wt['articlelist']} type='radio' name='whatsthis' value='articlelist'>a list of articles<br/>
<INPUT {$wt['listpagename']} type='radio' name='whatsthis' value='listpagename'>the name of an article with a list of pages<br/>

{$site}
<input type='checkbox' name='useapi' value='1' id='useapi' $useapi_checked /><label for='useapi'>Use API</label><br/>
Title : <input type='text' name='document_title' value='' size=40/><br/>
<input type='checkbox' name='add_gfdl' value='1' checked>Include GFDL (for some output formats)</input><br/>
<input type='checkbox' name='keep_categories' value='1' checked>Keep categories</input><br/>
<input type='checkbox' name='keep_interlanguage' value='1' checked>Keep interlanguage links</input><br/>
<input type='submit' name='doit' value='Convert'/>
</td><td valign='top' style='border-left:1px black solid'>
<b>Output</b>
<br/><INPUT {$out['xml']} type='radio' name='output_format' value='xml'>XML
<br/><INPUT {$out['text']} type='radio' name='output_format' value='text'>Plain text
<input type='checkbox' name='plaintext_markup' value='1' ".(get_param('plaintext_markup',true)?'checked':'').">Use *_/ markup</input>
<input type='checkbox' name='plaintext_prelink' value='1' ".(get_param('plaintext_prelink',true)?'checked':'').">Put → before internal links</input>
<br/><INPUT {$out['translated_text']} type='radio' name='output_format' value='translated_text'>Plain text, google-translated to
<select name='translated_text_target_language'>{$tttlo}</select> (works only for wikipedia/wikibooks)
<br/><INPUT {$out['xhtml']} type='radio' name='output_format' value='xhtml'>XHTML
<input type='checkbox' name='xhtml_justify' value='1' ".(get_param('xhtml_justify',true)?'checked':'').">Align paragraphs as 'justify'</input>
<input type='checkbox' name='xhtml_logical_markup' value='1' ".(get_param('xhtml_logical_markup',true)?'checked':'').">Use logical markup (e.g., 'strong' instead of 'b')</input>
<input type='checkbox' name='xhtml_source' value='1' ".(get_param('xhtml_source',false)?'checked':'').">Return source XHTML</input>
<br/><INPUT {$out['docbook_xml']} type='radio' name='output_format' value='docbook_xml'>DocBook XML
{$optional}
</tr></table>
</form>
<p>
Known issues:
<ul>
<li>In templates, {{{variables}}} used within &lt;nowiki&gt; tags will be replaced as well (too lazy to strip them)</li>
<li>HTML comments are removed (instead of converted into XML tags)</li>
</ul>{$additional}
</p>" ;
}
130
/**
 * Fetches a request parameter by name.
 *
 * When $xmlg['allow_get'] is truthy the value is looked up in $_REQUEST,
 * otherwise only in $_POST.
 *
 * @param string $s       Name of the parameter.
 * @param mixed  $default Value returned when the parameter is not set.
 * @return mixed The parameter value, or $default.
 */
function get_param ( $s , $default = NULL ) {
	global $xmlg ;

	# Pick the superglobal to read from, then do a single lookup
	$source = $xmlg['allow_get'] ? $_REQUEST : $_POST ;

	return isset ( $source[$s] ) ? $source[$s] : $default ;
}
147
/**
 * Adds one article to the stack of to-be-converted articles.
 *
 * The name is trimmed and its underscores are turned into spaces; names that
 * are empty after trimming are ignored.
 *
 * @param array  $aArticles Article stack, extended in place.
 * @param string $article   Article name to add.
 */
function push_article ( &$aArticles, $article ) {
	$name = trim ( $article ) ;
	if ( $name == "" ) {
		return ; # nothing to queue
	}

	# convert _ to ' '
	$aArticles[] = str_replace ( '_' , ' ' , $name ) ;
}
158
/**
 * Appends XML, or links to XML temporary files.
 *
 * Without temp files ($xmlg["use_xml_temp_files"] falsy), $new_xml is
 * concatenated onto the string $xml. With temp files, $new_xml is written to
 * a fresh file below $xmlg["temp_dir"] and the file name is appended to the
 * array $xml (a non-array $xml is replaced by an empty array first).
 *
 * @param string|array $xml     Accumulator, modified in place.
 * @param string       $new_xml XML fragment to add.
 * @throws RuntimeException When no temporary file could be opened after
 *                          several attempts.
 */
function append_to_xml ( &$xml , $new_xml ) {
	global $xmlg ;

	if ( !$xmlg["use_xml_temp_files"] ) { # Do not use temp files
		$xml .= $new_xml ;
		return ;
	}

	# Use temp files
	if ( !is_array ( $xml ) ) $xml = array () ;

	# Retry a bounded number of times: the script runs without a time limit,
	# so an unbounded retry would hang forever on an unwritable temp dir.
	$max_attempts = 10 ;
	$tmp_file = false ;
	$tmp_file_name = false ;
	for ( $attempt = 0 ; $attempt < $max_attempts ; $attempt++ ) {
		$tmp_file_name = tempnam ( $xmlg["temp_dir"] , "XMLTMP" ) ;
		if ( $tmp_file_name === false ) continue ;
		$tmp_file = fopen ( $tmp_file_name , 'wb' ) ;
		if ( $tmp_file !== false ) break ;
		# tempnam() created the file; do not leave it behind when it cannot be opened
		@unlink ( $tmp_file_name ) ;
	}
	if ( $tmp_file === false ) {
		throw new RuntimeException ( "Could not create a temporary XML file" ) ;
	}

	fwrite ( $tmp_file , $new_xml ) ;
	fclose ( $tmp_file ) ;

	$xml[] = $tmp_file_name ;
}
177
/**
 * Returns the next article XML, or false when nothing is left.
 *
 * An array $xml is treated as a queue of temp file names: the first file is
 * read, deleted and its contents returned. A non-array $xml is treated as the
 * whole XML string: it is returned once and $xml is reset to an empty array.
 *
 * @param string|array $xml XML string or list of temp file names, modified in place.
 * @return string|false
 */
function xml_shift ( &$xml ) {
	if ( is_array ( $xml ) ) { # Use temp files
		if ( count ( $xml ) == 0 ) {
			return false ;
		}
		$file_name = array_shift ( $xml ) ;
		$contents = file_get_contents ( $file_name ) ;
		unlink ( $file_name ) ;
		return $contents ;
	}

	# Do not use temp files
	if ( $xml == '' ) {
		return false ;
	}
	$whole = $xml ;
	$xml = array () ;
	return $whole ;
}
195
/**
 * Frees temporary XML files, if any.
 * Should not be necessary if xml_shift was used.
 *
 * Does nothing unless $xmlg["use_xml_temp_files"] is set. Otherwise every
 * file named in $xml is deleted and $xml is reset to an empty array.
 *
 * @param array $xml List of temp file names, emptied in place.
 * @return false|null false when temp files are in use but $xml is not an array.
 */
function xml_cleanup ( &$xml ) {
	global $xmlg ;

	if ( $xmlg["use_xml_temp_files"] ) {
		if ( !is_array ( $xml ) ) {
			return false ;
		}
		# Drain the list front to back, deleting each file as it is removed
		while ( count ( $xml ) > 0 ) {
			unlink ( array_shift ( $xml ) ) ;
		}
		$xml = array () ;
	}
}
207
208
209	## MAIN PROGRAM
210
# Entry point: either run a conversion (request parameter 'doit' set) and
# stream the result in the requested output format, or show the input form.
if ( get_param('doit',false) ) { # Process
	# NOTE(review): stripslashes() presumably compensates for magic quotes on
	# old PHP installations - confirm before running on a current PHP version.
	$wikitext = stripslashes ( get_param('text') ) ;

	if( !defined( 'MEDIAWIKI' ) ) { # Stand-alone
		$content_provider = new ContentProviderHTTP ;
	} else { # MediaWiki extension
		$content_provider = new ContentProviderMySQL ;
	}
	$converter = new MediaWikiConverter ;

	$xmlg["useapi"] = isset ( $_REQUEST['useapi'] ) ;
	$xmlg["book_title"] = get_param('document_title');
	$xmlg["site_base_url"] = get_param('site') ;
	$xmlg["resolvetemplates"] = get_param('use_templates','all') ;
	$xmlg['templates'] = explode ( "\n" , get_param('templates','') ) ;
	$xmlg['add_gfdl'] = get_param('add_gfdl',false) ;
	$xmlg['keep_interlanguage'] = get_param('keep_interlanguage',false) ;
	$xmlg['keep_categories'] = get_param('keep_categories',false) ;

	# the article list
	$aArticles = array () ;

	$t = microtime_float() ;
	$xml = "" ;

	$format = get_param('output_format') ;
	$whatsthis = get_param('whatsthis') ;

	# Catch listnamepage
	if ( $whatsthis == "listpagename" ) {
		# The first input line names the page that holds the article list
		$first_lines = explode ( "\n" , $wikitext , 2 ) ;
		$listpage = trim ( $first_lines[0] ) ;
		$wikitext = $content_provider->get_wiki_text ( $listpage ) ;
		$lines = explode ( "\n" , $wikitext ) ;
		$wikitext = array () ;
		foreach ( $lines AS $l ) {
			# Only list-like lines (*, #, :) are considered
			$l1 = substr ( $l , 0 , 1 ) ;
			if ( $l1 != '*' && $l1 != '#' && $l1 != ':' ) continue ;
			# Take the text between the first '[[' and the following ']]'
			$link_parts = explode ( '[[' , $l , 2 ) ;
			$link_target = explode ( ']]' , array_pop ( $link_parts ) , 2 ) ;
			$l = trim ( $link_target[0] ) ;
			if ( $l == '' ) continue ;
			$wikitext[] = $l ;
		}
		$wikitext = implode ( "\n" , $wikitext ) ;
		$whatsthis = 'articlelist' ;
	}

	# QUICK HACK! NEEDS TO WORK!
	if ( $format == "odt" || $format == "odt_xml" || $format == "docbook_pdf" || $format == "docbook_html" || $format == "docbook_xml" ) {
		$xmlg["allow_xml_temp_files"] = false ;
	}

	if ( $whatsthis == "wikitext" ) {
		$content_provider->first_title = "Raw wikitext page" ;
		$wiki2xml_authors = array () ;
		$xml = $converter->article2xml ( "" , $wikitext , $xmlg ) ;
	} else {
		if ( $xmlg['allow_xml_temp_files'] ) $xmlg['use_xml_temp_files'] = true ;

		foreach ( explode ( "\n" , $wikitext ) AS $a ) {
			push_article( $aArticles, $a );
		}

		# set the first article name as the default title
		# (the list can be empty when the input contained only blank lines)
		if ($xmlg["book_title"] == '') {
			$xmlg["book_title"] = isset ( $aArticles[0] ) ? $aArticles[0] : '' ;
		}
		# as long as we have articles to convert (this might change in between!)
		while ( $a = array_shift( $aArticles ) ) {
			$wiki2xml_authors = array () ;

			# Article page|Article name
			$a = explode ( '|' , $a ) ;
			if ( count ( $a ) == 1 ) $a[] = $a[0] ;
			$title_page = trim ( array_shift ( $a ) ) ;
			$title_name = trim ( array_pop ( $a ) ) ;

			$wikitext = $content_provider->get_wiki_text ( $title_page ) ;
			add_authors ( $content_provider->authors ) ;
			append_to_xml ( $xml , $converter->article2xml ( $title_name , $wikitext , $xmlg, $aArticles ) ) ;
		}
	}

	# Timing: total, load (as reported by the content provider) and the remainder
	$t = microtime_float() - $t ;
	$tt = round( $t, 3 ) ;
	$lt = round( $content_provider->load_time, 3 ) ;
	$t = round( $t - $lt, 3) ;

	$xmlg['xml_articles_header'] = "<articles xmlns:xhtml=\" \" loadtime='{$lt} sec' rendertime='{$t} sec' totaltime='{$tt} sec'>" ;

	# Output format
	if ( $format == "xml" ) {
		header('Content-type: text/xml; charset=utf-8');
		print "<?xml version='1.0' encoding='UTF-8' ?>\n" ;
		print xml_articles_header() ;
		while ( $x = xml_shift ( $xml ) ) print $x ;
		print "</articles>" ;
	} else if ( $format == "text" ) {
		$xmlg['plaintext_markup'] = get_param('plaintext_markup',false) ;
		$xmlg['plaintext_prelink'] = get_param('plaintext_prelink',false) ;
		$out = $converter->articles2text ( $xml , $xmlg ) ;
		$out = str_replace ( "\n" , "<br/>" , $out ) ;
		header('Content-type: text/html; charset=utf-8');
		print $out ;

	} else if ( $format == "translated_text" ) {
		$xmlg['plaintext_markup'] = false ;
		$xmlg['plaintext_prelink'] = false ;
		$out = $converter->articles2text ( $xml , $xmlg ) ;
		# Drop the first line of the plain text before translating
		$out = explode ( "\n" , $out ) ;
		array_shift ( $out ) ;
		$out = trim ( implode ( "\n" , $out ) ) ;
		# The source language is the first dot-separated part of the site URL
		$host_parts = explode ( '.' , $xmlg["site_base_url"] ) ;
		$source_language = $host_parts[0] ;
		$target_language = get_param ( 'translated_text_target_language' , 'en' ) ;
		$langpair = urlencode ( "{$source_language}|{$target_language}" ) ;
		$url = "http://www.google.com/translate_t?langpair={$langpair}&text=" . urlencode ( utf8_decode ( $out ) ) ;
		echo file_get_contents ( $url ) ;

	} else if ( $format == "xhtml" ) {
		$xmlg['xhtml_justify'] = get_param ( 'xhtml_justify' , false ) ;
		$xmlg['xhtml_logical_markup'] = get_param ( 'xhtml_logical_markup' , false ) ;
		$xmlg['xhtml_source'] = get_param ( 'xhtml_source' , false ) ;

		if ( $xmlg['xhtml_source'] ) {
			header('Content-type: text/xml; charset=utf-8');
			$s = $converter->articles2xhtml ( $xml , $xmlg ) ;
			# Put every tag on its own line
			$s = str_replace ( '>' , ">\n" , $s ) ;
			$s = str_replace ( '<' , "\n<" , $s ) ;
			$s = str_replace ( "\n\n" , "\n" , $s ) ;
			echo trim ( $s ) ;
		} else {
			# Header hack for IE
			# (the Accept header is optional, so check that it is present first)
			if ( isset ( $_SERVER["HTTP_ACCEPT"] ) && stristr($_SERVER["HTTP_ACCEPT"],"application/xhtml+xml") ) {
				header("Content-type: application/xhtml+xml");
			} else {
				header("Content-type: text/html");
			}
			echo $converter->articles2xhtml ( $xml , $xmlg ) ;
		}

	} else if ( $format == "odt" || $format == "odt_xml" ) {
		if ( isset ( $_REQUEST['odt_footnote'] ) ) $xmlg["odt_footnote"] = 'endnote' ;
		if ( $xmlg['sourcedir'] == '.' ) $cwd = getcwd() ;
		else $cwd = $xmlg['sourcedir'] ;
		$template_file = $cwd . '/template.odt' ;

		$dir_file = tempnam($xmlg["temp_dir"], "ODD");
		$dir = $dir_file . "-DIR" ;
		$xmlg['image_destination'] = $dir . "/Pictures" ;

		$zipdir = $cwd ;
		if ( isset ( $xmlg["zip_odt_path"] ) ) # Windows strange bug workaround
			$zipdir = $xmlg["zip_odt_path"] ;

		chdir ( $zipdir ) ;

		# Unzip template
		$cmd = $xmlg['unzip_odt'] ;
		$cmd = str_replace ( '$1' , escapeshellarg ( $template_file ) , $cmd ) ;
		$cmd = str_replace ( '$2' , escapeshellarg ( $dir ) , $cmd ) ;
		exec ( $cmd ) ;

		# Convert XML to ODT
		chdir ( $cwd ) ;
		if ( $format == "odt_xml" ) $content_provider->block_file_download = true ;
		$out = $converter->articles2odt ( $xml , $xmlg ) ;
		chdir ( $zipdir ) ;

		# Create ODT structure
		$handle = fopen ( $dir . "/content.xml" , "w" ) ;
		if ($handle) {
			fwrite ( $handle , $out ) ;
			fclose ( $handle ) ;
			# Generate temporary ODT file
			$out_file = tempnam('', "ODT");
			$cmd = $xmlg['zip_odt'] ;
			$cmd = str_replace ( '$1' , escapeshellarg ( $out_file ) , $cmd ) ;

			if ( $xmlg['is_windows'] ) {
				$cmd = str_replace ( '$2' , escapeshellarg ( $dir . "/" ) , $cmd ) ;
			} else {
				$cmd = str_replace ( '$2' , escapeshellarg ( './' ) , $cmd ) ;
				# linux/unix zip needs to be in the directory, otherwise it will
				# include needless parts into the directory structure
				chdir ($dir);
			}

			# remove the output if it for some reason already exists
			@unlink ( $out_file ) ;
			exec ( $cmd ) ;

			if ( $format == "odt" ) { # Return ODT file
				$filename = $xmlg["book_title"] ;
				if (!preg_match('/\.[a-zA-Z]{3}$/',$filename)) { $filename .= '.odt'; }
				if (!preg_match('/\.[a-zA-Z]{3}$/',$out_file)) { $out_file .= '.zip'; }
				# The title is user input: strip characters that would break
				# out of the quoted filename or the header line
				$filename = str_replace ( array ( '"' , "\r" , "\n" ) , '' , (string) $filename ) ;
				header('Content-type: application/vnd.oasis.opendocument.text; charset=utf-8');
				header('Content-Disposition: inline; filename="'.$filename.'"');
				# Only stream the file when it could be opened, so that the
				# cleanup below still runs if the zip step produced nothing
				$handle = fopen($out_file, 'rb');
				if ( $handle ) {
					fpassthru ( $handle ) ;
					fclose ( $handle ) ;
				}
			} else { # Return XML
				header('Content-type: text/xml; charset=utf-8');
				print str_replace ( ">" , ">\n" , $out ) ;
			}

			# Cleanup
			SureRemoveDir ( $dir ) ;
			@rmdir ( $dir ) ;
			@unlink ( $dir_file ) ;
			@unlink ( $out_file ) ;
			chdir ( $cwd ) ;
		} # error occured

	} else if ( $format == "docbook_xml" ) {
		$out = $converter->articles2docbook_xml ( $xml , $xmlg ) ;
		header('Content-type: text/xml; charset=utf-8');
		print $out ;
	} else if ( $format == "docbook_pdf" || $format == "docbook_html" ) {
		# "pdf" or "html": the part of the format name after "docbook_"
		$filetype = substr ( $format , 8 ) ;
		$filename = $converter->articles2docbook_pdf ( $xml , $xmlg , strtoupper ( $filetype ) ) ;

		if ( file_exists ( $filename ) ) {
			$fp = fopen($filename, 'rb');
			if ( $fp ) {
				# The title is user input: strip characters that would break
				# out of the quoted filename or the header line
				$download_name = str_replace ( array ( '"' , "\r" , "\n" ) , '' , (string) $xmlg["book_title"] ) ;
				if ( $format == "docbook_pdf" ) {
					header('Content-Type: application/pdf');
					header("Content-Length: " . (string) filesize($filename));
					header('Content-Disposition: attachment; filename="'.$download_name.'.pdf"');
				} else if ( $format == "docbook_html" ) {
					header('Content-Type: text/html');
					header("Content-Length: " . (string) filesize($filename));
					header('Content-Disposition: inline; filename="'.$download_name.'.html"');
				}
				fpassthru($fp);
				fclose ( $fp ) ;
			}
		}

		# Cleanup
		$pdf_dir = dirname ( dirname ( $filename ) ) ;
		SureRemoveDir ( $pdf_dir ) ;
		@rmdir ( $pdf_dir ) ;
	}
	xml_cleanup ( $xml ) ;
	exit ;
} else { # Show the form
	if( !defined( 'MEDIAWIKI' ) ) { # Stand-alone
		header('Content-type: text/html; charset=utf-8');
		print "
<html><head></head><body>
<h1>Magnus' magic MediaWiki-to-XML-to-stuff converter</h1>
<p>All written in PHP - so portable, <s>so incredibly slow...</s> <i>about as fast as the original MediaWiki parser!</i> <small>(For the source, see <a href='http://www.mediawiki.org/wiki/Subversion'>here</a>, trunk 'wiki2xml', directory 'php')</small></p>" ;
		$xmlg["useapi"] = 1 ;
		print get_form () ;
		print "</body></html>" ;
	} else { # MediaWiki extension
		$out = get_form ( true ) ;
	}

}
474
475	#<input type='checkbox' name='resolvetemplates' value='1' checked>Automatically resolve templates</input><br/>
476
477	?>

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wiki2xml/php/w2x.php

Download in other formats: