Last change
on this file since 25141 was 25141, checked in by papitha, 12 years ago |
NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!
|
File size:
851 bytes
|
Rev | Line | |
---|
[25141] | 1 | #!/usr/bin/python
|
---|
| 2 | # Extracts plain text from articles converted into "XML" using wiki2xml_all.
|
---|
| 3 | #
|
---|
| 4 | # Evan Jones <[email protected]>
|
---|
| 5 | # April, 2008
|
---|
| 6 | # Released under a BSD licence.
|
---|
| 7 | # http://evanjones.ca/software/wikipedia2text.html
|
---|
| 8 |
|
---|
| 9 | import os
|
---|
| 10 | import sys
|
---|
| 11 |
|
---|
| 12 | import wikisoup
|
---|
| 13 |
|
---|
def findXMLIterator(path):
    """Recursively yield the path of every ``.xml`` file found under *path*.

    Args:
        path: Directory to search. It is handed to ``os.listdir``, so it
            must name an existing, readable directory.

    Yields:
        Paths built with ``os.path.join`` (rooted at *path*) for every
        non-directory entry whose name ends with the lowercase suffix
        ``.xml``. Subdirectories are descended into and are never yielded
        themselves, even when their own name ends in ``.xml``.

    Raises:
        OSError: Propagated from ``os.listdir`` while iterating, when a
            directory cannot be listed.
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # traversal (and anything built from it) reproducible between runs.
    for name in sorted(os.listdir(path)):
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            # Explicit loop instead of ``yield from`` so this also runs on
            # Python 2.
            for found in findXMLIterator(fullpath):
                yield found
        elif fullpath.endswith(".xml"):
            yield fullpath
|
---|
| 23 |
|
---|
# Command line: sys.argv[1] is the directory tree to scan for .xml files,
# sys.argv[2] is the file the extracted text is written to.
# The context manager guarantees the output file is flushed and closed, even
# if the scan is interrupted or aborts with an error.
with open(sys.argv[2], "w") as output:
    # Extract the words from all the files in the subdirectories
    for xmlfile in findXMLIterator(sys.argv[1]):
        try:
            output.write(wikisoup.extractWikipediaText(xmlfile).encode("UTF-8"))
        except Exception:
            # Best effort: report the file that could not be processed and
            # carry on with the next one. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt and SystemExit
            # stop the run.
            print(xmlfile)
|
---|
Note:
See
TracBrowser
for help on using the repository browser.