Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago |
NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!
|
File size:
851 bytes
|
Line | |
---|
1 | #!/usr/bin/python
|
---|
2 | # Extracts plain text from articles converted into "XML" using wiki2xml_all.
|
---|
3 | #
|
---|
4 | # Evan Jones <[email protected]>
|
---|
5 | # April, 2008
|
---|
6 | # Released under a BSD licence.
|
---|
7 | # http://evanjones.ca/software/wikipedia2text.html
|
---|
8 |
|
---|
9 | import os
|
---|
10 | import sys
|
---|
11 |
|
---|
12 | import wikisoup
|
---|
13 |
|
---|
def findXMLIterator(path):
    """Recursively yield the path of every ``.xml`` file found under *path*.

    Each name returned by ``os.listdir(path)`` is joined onto *path*.
    Directories are descended into recursively; any other entry is yielded
    when its joined path ends with ``".xml"`` (case-sensitive).  Names are
    visited in sorted order so the sequence of results is reproducible from
    run to run.

    path -- directory to search.

    This is a generator: an OSError from ``os.listdir`` (for example an
    unreadable directory) is raised while iterating, not when the function
    is called.
    """
    # os.listdir() returns names in arbitrary order; sort for a stable result.
    for name in sorted(os.listdir(path)):
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            # The directory test comes first, so a directory is only ever
            # descended into and never yielded, even if its name ends in ".xml".
            for found in findXMLIterator(fullpath):
                yield found
        elif fullpath.endswith(".xml"):
            yield fullpath
|
---|
23 |
|
---|
# Command line: <script> <xml directory> <output file>
#   sys.argv[1] -- directory searched recursively for .xml files
#   sys.argv[2] -- file that receives the concatenated extracted text
if len(sys.argv) < 3:
    sys.stderr.write("Usage: %s <xml directory> <output file>\n" % sys.argv[0])
    sys.exit(1)

# NOTE(review): the encoded (byte) text is written to a file opened in "w"
# mode, which is how Python 2 expects it; a Python 3 port would need "wb".
output = open(sys.argv[2], "w")
try:
    # Extract the words from all the files in the subdirectories
    for xmlfile in findXMLIterator(sys.argv[1]):
        try:
            output.write(wikisoup.extractWikipediaText(xmlfile).encode("UTF-8"))
        except Exception:
            # Best effort: report the file that could not be processed and
            # carry on with the rest.  Catching Exception rather than using a
            # bare except lets Ctrl-C and sys.exit() still stop the run.
            print(xmlfile)
finally:
    # Always flush and release the output file, even if the walk aborts.
    output.close()
|
---|
Note: See TracBrowser for help on using the repository browser.