source: gs2-extensions/ngramj/src/wiki/wikipedia2text/xmldump2files.py@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 2.7 KB
Line 
1#!/usr/bin/python
2# Split a Wikipedia XML dump into individual files. The files are stored in a
3# directory tree based on hashing the title of the article.
4#
5# Evan Jones <[email protected]>
6# April, 2008
7# Released under a BSD licence.
8# http://evanjones.ca/software/wikipedia2text.html
9
import hashlib
import md5
import os
import sys
import urllib
import xml.sax

try:
    from urllib import quote as url_quote  # Python 2
except ImportError:
    from urllib.parse import quote as url_quote  # Python 3
15
def writeArticle(root, title, text, max_path_len=96):
    """Write one article into a hashed directory tree below *root*.

    The article is stored as ``<root>/<h[0:2]>/<h[2:4]>/<name>.txt`` where
    ``h`` is the hex MD5 digest of the UTF-8 encoded title and ``<name>`` is
    the title with spaces replaced by underscores and then percent-quoted
    ("/" becomes "%2F") so that it forms a single path component.  The file
    name is printed to standard output before anything is written.

    Args:
        root: Directory holding the two-level hash tree.  Missing
            directories (including *root* itself) are created.
        title: Article title as text.
        text: Article body as text; it is stored UTF-8 encoded.
        max_path_len: An article whose complete output path is this many
            characters or longer is not written; a note is sent to standard
            error instead.  Defaults to 96, the limit that used to be
            hard-coded.

    Raises:
        OSError / IOError: If a directory cannot be created or the file
            cannot be written.
    """
    # ~2.4 million articles at the moment
    # assuming an even distribution, we want 2 levels of 2 character
    # directories: 3 million / 256 / 256 = 46
    # Thus we won't have too many items in any directory
    title_bytes = title.encode("UTF-8")
    digest = hashlib.md5(title_bytes).hexdigest()
    article_dir = os.path.join(root, digest[0:2], digest[2:4])

    # Wikipedia-ize the title for the file name.  The quote function leaves
    # "/" untouched by default, so it is escaped separately:
    # "%x" % ord("/") == 2f
    name = url_quote(title_bytes.replace(b" ", b"_"))
    name = name.replace("/", "%2F") + ".txt"
    print(name)
    filename = os.path.join(article_dir, name)

    # Decide whether to skip *before* touching the file system so that a
    # skipped article does not leave empty directories behind.
    if len(filename) >= max_path_len:
        sys.stderr.write(
            "skipping %s: path is %d characters long (limit %d)\n"
            % (name, len(filename), max_path_len)
        )
        return

    try:
        os.makedirs(article_dir)
    except OSError:
        # An already existing directory is fine (possibly created by a
        # concurrent run); anything else is a genuine failure.
        if not os.path.isdir(article_dir):
            raise

    # Binary mode: the payload is already encoded bytes.  The context
    # manager closes the file even when the write fails.
    with open(filename, "wb") as out:
        out.write(text.encode("UTF-8"))
44
45
class WikiPageSplitter(xml.sax.ContentHandler):
    """SAX content handler that writes each article of a dump to a file.

    Only ``page``, ``title`` and ``text`` elements are tracked on
    ``self.stack``; every other element is ignored (and must appear either
    outside a page or directly while ``page`` is the innermost tracked
    element).  When a ``text`` element closes, the collected title and text
    are handed to ``writeArticle`` together with ``self.root``.

    Attributes:
        root: Output directory passed through to ``writeArticle``.
        stack: Names of the currently open tracked elements.
        title: Title of the current page; ``None`` until a ``title``
            element starts, complete once that element has ended.
        text: Text of the current page; ``None`` until a ``text`` element
            starts, complete once that element has ended.
    """

    def __init__(self, root):
        # Explicit base call: ContentHandler may be an old-style class on
        # Python 2, so super() is not usable there.
        xml.sax.ContentHandler.__init__(self)
        self.root = root
        self.stack = []
        self.text = None
        self.title = None
        # Character data arrives in many small chunks; collect the chunks
        # and join once instead of repeatedly concatenating strings.
        self._title_parts = []
        self._text_parts = []

    def startElement(self, name, attributes):
        """Track the start of ``page``/``title``/``text``; ignore the rest."""
        if name == "page":
            assert self.stack == []
            self.text = None
            self.title = None
        elif name == "title":
            assert self.stack == ["page"]
            assert self.title is None
            self.title = ""
            self._title_parts = []
        elif name == "text":
            assert self.stack == ["page"]
            assert self.text is None
            self.text = ""
            self._text_parts = []
        else:
            # Untracked element: allowed only outside a page or directly
            # below it, and never pushed onto the stack.
            assert len(self.stack) == 0 or self.stack[-1] == "page"
            return

        self.stack.append(name)

    def endElement(self, name):
        """Close a tracked element; write the article when ``text`` ends."""
        if not self.stack or name != self.stack[-1]:
            # End tag of an untracked element (or a stray one): nothing to do.
            return

        self.stack.pop()
        if name == "title":
            self.title = "".join(self._title_parts)
        elif name == "text":
            self.text = "".join(self._text_parts)
            self._text_parts = []  # release the chunks early
            # We have the complete article: write it out
            writeArticle(self.root, self.title, self.text)

    def characters(self, content):
        """Collect character data belonging to the open title or text."""
        assert content is not None and len(content) > 0
        if len(self.stack) == 0:
            return

        current = self.stack[-1]
        if current == "title":
            self._title_parts.append(content)
        elif current == "text":
            assert self.title is not None
            self._text_parts.append(content)
91
92
def main(argv=None):
    """Split the dump named by ``argv[1]`` into files below ``argv[2]``.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv``.  Arguments after the second one are
            ignored.

    Raises:
        SystemExit: With a usage message when fewer than two arguments
            follow the program name.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit("usage: xmldump2files.py <wikipedia-dump.xml> <output-directory>")
    xml.sax.parse(argv[1], WikiPageSplitter(argv[2]))


# Only run the conversion when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.