source: gs2-extensions/ngramj/src/wiki/wikiextract.py@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 865 bytes
Line 
#!/usr/bin/python
# Extracts plain text from articles converted into "XML" using wiki2xml_all.
#
# Evan Jones <[email protected]>
# April, 2008
# Released under a BSD licence.
# http://evanjones.ca/software/wikipedia2text.html
8
9import os
10import sys
11
12import wikisoup
13
def findXMLIterator(path):
    """Yield the path of every ``.xml`` file found beneath *path*.

    Sub-directories are searched recursively.  Each yielded value is *path*
    joined with the file's location below it, so results are absolute only
    when *path* itself is absolute.

    Entries within a directory are visited in sorted name order so that
    repeated runs over the same tree yield the same sequence.

    This is a generator: any OSError raised by ``os.listdir`` (missing or
    unreadable directory) surfaces while iterating, not when the function
    is called.
    """
    # os.listdir() makes no ordering promise; sort for reproducible output.
    for name in sorted(os.listdir(path)):
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            # Directories are always descended into, whatever their name.
            for found in findXMLIterator(fullpath):
                yield found
        elif fullpath.endswith(".xml"):
            # Case-sensitive suffix match: "A.XML" is not reported.
            yield fullpath
23
# Usage: wikiextract.py <directory-of-xml-files> <output-file>
#
# The destination is opened once; the ``with`` block guarantees it is flushed
# and closed even when extraction of an article fails part-way through.
with open(sys.argv[2], "w") as output:
    # Extract the words from all the files in the subdirectories
    for xmlfile in findXMLIterator(sys.argv[1]):
        try:
            # The extracted text is encoded to UTF-8 before being written.
            output.write(wikisoup.extractWikipediaText(xmlfile).encode("UTF-8"))
        except:
            # Deliberately broad: name the file being processed for any
            # failure (including an interrupt), then let the original
            # exception propagate unchanged.
            print(xmlfile)
            raise
Note: See TracBrowser for help on using the repository browser.