1 | #!/usr/bin/python
|
---|
2 | # Split a Wikipedia XML dump into individual files. The files are stored in a
|
---|
3 | # directory tree based on hashing the title of the article.
|
---|
4 | #
|
---|
5 | # Evan Jones <[email protected]>
|
---|
6 | # April, 2008
|
---|
7 | # Released under a BSD licence.
|
---|
8 | # http://evanjones.ca/software/wikipedia2text.html
|
---|
9 |
|
---|
10 | import md5
|
---|
11 | import os
|
---|
12 | import sys
|
---|
13 | import urllib
|
---|
14 | import xml.sax
|
---|
15 |
|
---|
def writeArticle(root, title, text, max_path_len=96):
    """Write one article to ``root/<h[0:2]>/<h[2:4]>/<quoted title>.txt``.

    ``h`` is the hexadecimal MD5 digest of the UTF-8 encoded title. The file
    name is the title with spaces turned into underscores and then
    percent-quoted (including "/", which becomes "%2F"), plus ".txt".

    Args:
        root: Output directory. It is created if it does not exist yet.
        title: Article title as a (unicode) string.
        text: Article body as a (unicode) string; written UTF-8 encoded.
        max_path_len: Articles whose complete output path is this long or
            longer are skipped. The default of 96 is the historical limit of
            this script.

    Side effects:
        Prints the generated file name to stdout. Skipped articles are
        reported on stderr and no directories are created for them.

    Raises:
        OSError / IOError: if a directory or the file cannot be created.
    """
    # Function-scope imports keep this function self-contained. hashlib
    # replaces the deprecated md5 module and produces the same digests.
    import hashlib
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    # ~2.4 million articles at the moment
    # assuming an even distribution, we want 2 levels of 2 character
    # directories: 3 million / 256 / 256 = 46
    # Thus we won't have too many items in any directory
    encoded_title = title.encode("UTF-8")
    digest = hashlib.md5(encoded_title).hexdigest()
    directory = os.path.join(root, digest[0:2], digest[2:4])

    # Wikipedia-ize the title for the file name. safe="" makes quote() escape
    # "/" as "%2F" too, so a title can never add a path separator.
    basename = quote(encoded_title.replace(b" ", b"_"), safe="") + ".txt"
    print(basename)
    filename = os.path.join(directory, basename)

    if len(filename) >= max_path_len:
        # Previously these articles were dropped without any notice.
        sys.stderr.write("skipping article, path too long: %s\n" % filename)
        return

    # makedirs() also creates a missing root. Tolerate "already exists"
    # (the common case, and a possible race) but re-raise real failures.
    try:
        os.makedirs(directory)
    except OSError:
        if not os.path.isdir(directory):
            raise

    # Binary mode: the encoded bytes are written verbatim on every platform,
    # and the file is closed even if write() raises.
    with open(filename, "wb") as out:
        out.write(text.encode("UTF-8"))
44 |
|
---|
45 |
|
---|
class WikiPageSplitter(xml.sax.ContentHandler):
    """SAX handler that writes each article's text out via writeArticle().

    Only three elements are tracked on ``self.stack``: "page", and "title" /
    "text" seen while the stack is exactly ["page"]. Every other element is
    ignored and never pushed, so elements nested in a page do not change the
    stack. When a tracked "text" element ends, writeArticle(root, title, text)
    is called with the data collected for the current page.

    Attributes:
        root: Output directory handed to writeArticle().
        stack: Names of the currently open tracked elements.
        title: Title of the current page; None until a "title" element
            starts, complete once that element has ended.
        text: Text of the current page; None until a "text" element starts,
            complete once that element has ended.
    """

    def __init__(self, root):
        # ContentHandler is an old-style class on Python 2, so call the base
        # initializer explicitly instead of through super().
        xml.sax.ContentHandler.__init__(self)
        self.root = root
        self.stack = []
        self.text = None
        self.title = None
        # Chunks of the "title"/"text" element currently being read. The
        # parser may deliver one element's content in many characters()
        # calls; joining once at the end avoids re-copying the accumulated
        # string on every call. One buffer suffices because tracked "title"
        # and "text" elements cannot nest.
        self._chunks = []

    def startElement(self, name, attributes):
        """Begin tracking "page", "title" and "text"; ignore anything else."""
        if name == "page":
            assert self.stack == []
            self.text = None
            self.title = None
        elif name == "title":
            assert self.stack == ["page"]
            assert self.title is None
            self.title = ""
            self._chunks = []
        elif name == "text":
            assert self.stack == ["page"]
            assert self.text is None
            self.text = ""
            self._chunks = []
        else:
            # Untracked element: only legal outside of "title"/"text".
            assert not self.stack or self.stack[-1] == "page"
            return

        self.stack.append(name)

    def endElement(self, name):
        """Close a tracked element; finishing "text" writes the article."""
        if not self.stack or name != self.stack[-1]:
            return

        self.stack.pop()
        if name == "title":
            self.title = "".join(self._chunks)
            self._chunks = []
        elif name == "text":
            # We have the complete article: write it out
            self.text = "".join(self._chunks)
            self._chunks = []
            writeArticle(self.root, self.title, self.text)

    def characters(self, content):
        """Collect character data while inside "title" or "text"."""
        assert content is not None and len(content) > 0
        if not self.stack:
            return

        current = self.stack[-1]
        if current == "title":
            self._chunks.append(content)
        elif current == "text":
            assert self.title is not None
            self._chunks.append(content)
91 |
|
---|
92 |
|
---|
def main(argv=None):
    """Split the dump named by argv[1] into files below directory argv[2].

    Args:
        argv: Argument list including the program name; defaults to sys.argv.
            Arguments after the second are ignored.

    Returns:
        0 after the dump has been parsed, 2 if arguments are missing (a usage
        message is written to stderr in that case).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        prog = argv[0] if argv else "<script>"
        sys.stderr.write(
            "Usage: %s <wikipedia XML dump> <output directory>\n" % prog)
        return 2

    xml.sax.parse(argv[1], WikiPageSplitter(argv[2]))
    return 0


if __name__ == "__main__":
    sys.exit(main())