source: gs2-extensions/ngramj/src/wiki/wiki2xml/php/wiki2xml_command.php@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 1.5 KB
Line 
<!--
Converts Wikipedia articles in wiki format into an XML format. It might
segfault or go into an "infinite" loop sometimes.

Evan Jones <[email protected]>
April, 2008
Released under a BSD licence.
http://evanjones.ca/software/wikipedia2text.html
-->
10
<?php
/*
 * Command-line driver: reads a wikitext file, converts it to XML with
 * MediaWikiConverter::article2xml(), and writes the XML to an output file.
 *
 * Usage: wiki2xml_command [input wikitext] [output wiki XML]
 *
 * The article title is derived from the input file name: the last
 * "/"-separated path component, minus a trailing ".txt", URL-decoded.
 *
 * Exits with status 1 on wrong argument count, an unreadable or empty
 * input file, or a failed write of the output file.
 */
error_reporting(E_ALL);
require_once("mediawiki_converter.php");

if (count($argv) != 3) {
    echo "wiki2xml_command [input wikitext] [output wiki XML]\n";
    exit(1);
}

$filename = $argv[1];
$output_filename = $argv[2];

// file_get_contents() returns false when the file cannot be read; test for
// that explicitly instead of relying on strlen() coercing false to "".
$wikitext = file_get_contents($filename);
if ($wikitext === false || $wikitext === "") {
    echo "Bad input file\n";
    exit(1);
}

// Title = last path component of the input file name.
$last_slash = strrpos($filename, "/");
$title = ($last_slash === false)
    ? $filename
    : (string) substr($filename, $last_slash + 1);

// Strip only a trailing ".txt" extension. A blanket str_replace() would also
// remove ".txt" from the middle of a title (e.g. "Notes.txt_format.txt").
if (substr($title, -4) === ".txt") {
    $title = (string) substr($title, 0, -4);
}
$title = urldecode($title);

// Options passed to article2xml() for the XML conversion.
$xmlg = array(
    "usetemplates"        => "none",
    "resolvetemplates"    => "none",
    "templates"           => array(),
    "add_gfdl"            => false,
    "keep_interlanguage"  => false,
    "keep_categories"     => false,
    "text_hide_images"    => true,
    "text_hide_tables"    => true,
    "useapi"              => false,
    "xml_articles_header" => "<articles>",
);

// $content_provider is never referenced again in this script, yet the
// original author found the conversion fails without it.
// NOTE(review): presumably mediawiki_converter.php reads it as a global;
// confirm before renaming or removing.
$content_provider = new ContentProviderHTTP;

$converter = new MediaWikiConverter;
$xml = $converter->article2xml($title, $wikitext, $xmlg);

// To convert to plain text:
//~ require_once("xml2tree.php");
//~ require_once("xml2txt.php");
//~ $x2t = new xml2php ;
//~ $tree = $x2t->scanString($xml);
//~ $text = trim($tree->parse($tree));

// file_put_contents() returns false on failure; report it rather than
// exiting with status 0 and no output file.
if (file_put_contents($output_filename, $xml) === false) {
    echo "Could not write output file\n";
    exit(1);
}
?>
Note: See TracBrowser for help on using the repository browser.