Context Navigation

wikiextract.py@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago
NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!
File size: 865 bytes

Line
1	#!/usr/bin/python
2	# Extracts plain text from articles converted into "XML" using wiki2xml_all.
3	#
4	# Evan Jones <[email protected]>
5	# April, 2008
6	# Released under a BSD licence.
7	# http://evanjones.ca/software/wikipedia2text.html
8
9	import os
10	import sys
11
12	import wikisoup
13
def findXMLIterator(path):
    """Yield the path of every ".xml" file found under path.

    Subdirectories are searched recursively. Each yielded value is built
    by joining path with the entry names below it, so results are relative
    or absolute exactly as path is. The ".xml" suffix test is
    case-sensitive.

    Entries are visited in sorted name order so that the sequence of
    results (and anything built from it) is reproducible between runs;
    os.listdir() alone returns names in arbitrary order.
    """
    for name in sorted(os.listdir(path)):
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            # Directories are descended into, never yielded themselves,
            # even when their name happens to end with ".xml".
            for found in findXMLIterator(fullpath):
                yield found
        elif fullpath.endswith(".xml"):
            yield fullpath
23
# Usage: wikiextract.py <input directory> <output file>
#
# Extract the text from every .xml file below the input directory and
# append it, UTF-8 encoded, to the output file. The "with" block makes
# sure the output file is flushed and closed even when extraction fails.
with open(sys.argv[2], "w") as output:
    for xmlfile in findXMLIterator(sys.argv[1]):
        try:
            output.write(wikisoup.extractWikipediaText(xmlfile).encode("UTF-8"))
        except:
            # Deliberately broad: name the file being processed when
            # anything goes wrong, then let the original error propagate.
            print(xmlfile)
            raise

Note: See TracBrowser for help on using the repository browser.