<!--
Converts Wikipedia articles in wiki format into an XML format. It might
segfault or go into an "infinite" loop sometimes.

Evan Jones <[email protected]>
April, 2008
Released under a BSD licence.
http://evanjones.ca/software/wikipedia2text.html
-->

<?php
// Command-line entry point: reads one wikitext file and writes the XML
// produced by MediaWikiConverter::article2xml() to the given output path.
//
// Usage:  wiki2xml_command [input wikitext] [output wiki XML]
// Exits with status 1 on bad arguments, unreadable/empty input, or a failed
// write of the output file.
error_reporting(E_ALL);
require_once("mediawiki_converter.php");

// Exactly two arguments are expected after the script name.
if (count($argv) !== 3) {
    echo "wiki2xml_command [input wikitext] [output wiki XML]\n";
    exit(1);
}

$filename = $argv[1];
$wikitext = file_get_contents($filename);
// file_get_contents() returns false on failure; an empty file is rejected too.
if ($wikitext === false || $wikitext === "") {
    echo "Bad input file\n";
    exit(1);
}

// The article title is the last "/"-separated path component, without a
// trailing ".txt" extension, URL-decoded.
$filename_parts = explode("/", $filename);
$title = $filename_parts[count($filename_parts) - 1];
// Strip only a trailing ".txt"; occurrences elsewhere in the name are part
// of the title and must be kept.
$title = preg_replace('/\.txt\z/', '', $title);
$title = urldecode($title);

// Configures options for converting to XML
$xmlg = array();
$xmlg["usetemplates"] = "none";
$xmlg["resolvetemplates"] = "none";
$xmlg["templates"] = array();
$xmlg['add_gfdl'] = false;
$xmlg['keep_interlanguage'] = false;
$xmlg['keep_categories'] = false;
$xmlg['text_hide_images'] = true;
$xmlg['text_hide_tables'] = true;
$xmlg["useapi"] = false;
$xmlg["xml_articles_header"] = "<articles>";

// The original author notes the conversion does not work without this
// instance. NOTE(review): presumably the converter code reads
// $content_provider as a global -- confirm in mediawiki_converter.php.
$content_provider = new ContentProviderHTTP;

$converter = new MediaWikiConverter;
$xml = $converter->article2xml($title, $wikitext, $xmlg);

// To convert to plain text:
//~ require_once("xml2tree.php");
//~ require_once("xml2txt.php");
//~ $x2t = new xml2php ;
//~ $tree = $x2t->scanString($xml);
//~ $text = trim($tree->parse($tree));

// Report a failed write instead of silently exiting with success.
if (file_put_contents($argv[2], $xml) === false) {
    echo "Could not write output file\n";
    exit(1);
}
?>