<!--
Converts Wikipedia articles in wiki format into an XML format. It might
segfault or go into an "infinite" loop sometimes.

Evan Jones <[email protected]>
April, 2008
Released under a BSD licence.
http://evanjones.ca/software/wikipedia2text.html
-->

<?php
/*
 * Command-line driver for the wikitext-to-XML conversion.
 *
 * Reads one file path per line from standard input. For every non-empty
 * file it prints the input path, converts the file's contents with
 * MediaWikiConverter::article2xml(), writes the XML to a sibling file and
 * prints the output path. Processing stops at end of input or at the first
 * empty line.
 *
 * The article title is the last "/"-separated component of the path with a
 * trailing ".txt" removed and then URL-decoded.
 *
 * All variables here live at global scope and the included converter code is
 * not visible from this file, so their names are kept stable on purpose.
 */
error_reporting(E_ALL);
require_once("mediawiki_converter.php");

$stdin = fopen('php://stdin', 'r');
if ($stdin === false)
{
    // Nothing can be read; PHP has already reported the fopen() failure.
    exit(1);
}

while (1)
{
    $line = fgets($stdin);
    if ($line === false)
    {
        // End of input (or a read error).
        break;
    }

    // chop() strips the trailing newline and other trailing whitespace.
    $file = chop($line);
    if ($file === "")
    {
        // An empty line terminates processing, exactly like end of input.
        break;
    }

    $wikitext = file_get_contents($file);
    if ($wikitext === false || $wikitext === "")
    {
        // Unreadable (PHP has already emitted a warning) or empty: skip it.
        continue;
    }

    // Printed before converting, so the last path shown identifies the
    // article being processed if the conversion never returns.
    echo "$file\n";

    // Only a trailing ".txt" is the extension. Replacing every ".txt" in the
    // path would corrupt titles that contain ".txt" and redirect output when
    // a directory name contains it.
    $has_txt_suffix = (substr($file, -4) === '.txt');

    $filename_parts = explode("/", $file);
    $title = $filename_parts[count($filename_parts)-1];
    if ($has_txt_suffix)
    {
        $title = substr($title, 0, -4);
    }
    $title = urldecode($title);

    // Inputs without a ".txt" suffix get ".xml" appended, so the XML can
    // never be written over the source file itself.
    if ($has_txt_suffix)
    {
        $xml_file = substr($file, 0, -4) . '.xml';
    }
    else
    {
        $xml_file = $file . '.xml';
    }

    // Configures options for converting to XML.
    // NOTE(review): rebuilt for every article instead of being hoisted out of
    // the loop, because $xmlg is a global and the converter may read or
    // modify it - verify in mediawiki_converter.php before hoisting.
    $xmlg = array();
    $xmlg["usetemplates"] = "none";
    $xmlg["resolvetemplates"] = "none";
    $xmlg["templates"] = array();
    $xmlg['add_gfdl'] = false;
    $xmlg['keep_interlanguage'] = false;
    $xmlg['keep_categories'] = false;
    $xmlg['text_hide_images'] = true;
    $xmlg['text_hide_tables'] = true;
    $xmlg["useapi"] = false;
    $xmlg["xml_articles_header"] = "<articles>";

    // Never referenced again in this script. It is created at global scope,
    // so presumably the converter picks it up through
    // `global $content_provider` - TODO confirm. Do not remove or rename.
    $content_provider = new ContentProviderHTTP;

    $converter = new MediaWikiConverter;
    $xml = $converter->article2xml($title, $wikitext , $xmlg);

    if (file_put_contents($xml_file, $xml) === false)
    {
        // PHP has already emitted a warning; do not announce an output file
        // that was not written.
        continue;
    }
    echo $xml_file . "\n";
}

fclose($stdin);

// NOTE(review): the script has always exited with status 1, even after a
// fully successful run. Kept unchanged because callers may depend on it;
// confirm whether exit(0) is what is actually wanted.
exit(1);
?>