source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wiki2xml/php/wiki2xml_multi.php

Last change on this file was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 1.5 KB
Line 
<!--
Converts Wikipedia articles in wiki format into an XML format. It might
segfault or go into an "infinite" loop sometimes.

Evan Jones <[email protected]>
April, 2008
Released under a BSD licence.
http://evanjones.ca/software/wikipedia2text.html
-->

<?php
// Batch driver for the wiki-to-XML converter.
//
// Reads file paths from standard input, one per line. Each non-empty file is
// converted with MediaWikiConverter::article2xml() and the result is written
// next to the input, with a trailing ".txt" replaced by ".xml". Both the input
// path and the output path are echoed to standard output. Processing stops at
// end of input or at the first blank line.
error_reporting(E_ALL);
require_once("mediawiki_converter.php");

$stdin = fopen('php://stdin', 'r');
while (($line = fgets($stdin)) !== false)
{
    // Drop the trailing newline (and any other trailing whitespace).
    $file = rtrim($line);

    // A blank line ends the batch.
    if ($file === "")
    {
        break;
    }

    $wikitext = file_get_contents($file);
    if ($wikitext === false || $wikitext === "")
    {
        // Unreadable or empty input: nothing to convert, move to the next path.
        continue;
    }

    echo "$file\n";

    // Strip ".txt" only when it is the real trailing suffix. A blanket
    // str_replace() would also rewrite ".txt" inside directory names or in
    // the middle of a title.
    $txt_suffix = ".txt";
    $stem = $file;
    $stem_length = strlen($file) - strlen($txt_suffix);
    if ($stem_length >= 0 && substr($file, $stem_length) === $txt_suffix)
    {
        $stem = (string) substr($file, 0, $stem_length);
    }

    // The article title is the URL-decoded last path segment, minus the suffix.
    $path_segments = explode("/", $stem);
    $title = urldecode($path_segments[count($path_segments) - 1]);

    // Always derive the output path by appending ".xml" to the stem, so an
    // input without a ".txt" suffix is never overwritten by its own output.
    $xml_file = $stem . ".xml";

    // Configures options for converting to XML.
    // NOTE(review): rebuilt on every iteration on purpose; the converter may
    // read or modify $xmlg (possibly as a global) - confirm before hoisting.
    $xmlg = array();
    $xmlg["usetemplates"] = "none";
    $xmlg["resolvetemplates"] = "none";
    $xmlg["templates"] = array();
    $xmlg['add_gfdl'] = false;
    $xmlg['keep_interlanguage'] = false;
    $xmlg['keep_categories'] = false;
    $xmlg['text_hide_images'] = true;
    $xmlg['text_hide_tables'] = true;
    $xmlg["useapi"] = false;
    $xmlg["xml_articles_header"] = "<articles>";

    // Not referenced again in this script, but the original author found it
    // necessary. NOTE(review): presumably the included library reads
    // $content_provider as a global - keep the variable name unchanged.
    $content_provider = new ContentProviderHTTP;

    $converter = new MediaWikiConverter;
    $xml = $converter->article2xml($title, $wikitext, $xmlg);

    if (file_put_contents($xml_file, $xml) === false)
    {
        // Do not report an output path that was never written.
        file_put_contents('php://stderr', "Failed to write $xml_file\n");
        continue;
    }
    echo $xml_file . "\n";
}

fclose($stdin);

// NOTE(review): the script has always exited with status 1, even on success.
// Kept as-is because callers may depend on it - confirm whether 0 is intended.
exit(1);
?>
Note: See TracBrowser for help on using the repository browser.