1 | #!/usr/bin/env PYTHONIOENCODING=utf-8 python
|
---|
2 | # encoding: utf-8
|
---|
3 |
|
---|
4 | # Created by Chris Adams, source (https://gist.github.com/acdha/49a610089c2798db6fe2)
|
---|
5 |
|
---|
6 | from __future__ import absolute_import, print_function, unicode_literals
|
---|
7 |
|
---|
8 | import os
|
---|
9 | import re
|
---|
10 |
|
---|
11 | import requests
|
---|
12 |
|
---|
13 |
|
---|
def get_block_for_codepoint(cp):
    """Look up the name of the Unicode block that contains codepoint ``cp``.

    Walks the module-level ``UNICODE_BLOCKS`` sequence of
    ``(start, end, block_name)`` tuples in order and returns the name of the
    first entry whose inclusive ``start..end`` range covers ``cp``. When no
    entry matches, the string ``'No_Block'`` is returned.
    """
    matching_names = (
        name for first, last, name in UNICODE_BLOCKS if first <= cp <= last
    )
    # The generator is lazy, so the scan stops at the first matching range.
    return next(matching_names, 'No_Block')
|
---|
22 |
|
---|
23 |
|
---|
def load_unicode_blocks_from_file(f):
    """Parse Unicode block ranges out of an open ``Blocks.txt``-style file.

    ``f`` is any object with a ``read()`` method. It may return UTF-8 encoded
    bytes (a file opened in binary mode) or already-decoded text (a file
    opened in text mode).

    Returns a list of ``(start, end, block_name)`` tuples in file order, where
    ``start`` and ``end`` are the integer values of the hexadecimal range
    bounds. Entries whose name is ``'No_Block'`` are skipped.
    """
    file_contents = f.read()
    # Binary-mode files yield bytes that must be decoded; text-mode files
    # already yield text, which has no ``.decode`` method on Python 3.
    if isinstance(file_contents, bytes):
        file_contents = file_contents.decode('utf-8')

    # Matches "<HEX>..<HEX>; <name>" with the name trimmed of outer whitespace.
    range_matches = re.findall(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)', file_contents)

    return [
        (int(start, 16), int(end, 16), block_name)
        for start, end, block_name in range_matches
        if block_name != 'No_Block'
    ]
|
---|
35 |
|
---|
36 |
|
---|
def load_unicode_blocks(block_filename):
    """Load Unicode block ranges from ``block_filename``, fetching it if absent.

    When ``block_filename`` does not exist, ``Blocks.txt`` is downloaded from
    unicode.org and saved under that name before parsing. The file is then
    opened in binary mode and handed to ``load_unicode_blocks_from_file``.

    Returns the list of ``(start, end, block_name)`` tuples produced by
    ``load_unicode_blocks_from_file``.

    Raises whatever ``requests`` raises for a failed or timed-out download
    (``raise_for_status`` is called on the response), and the usual I/O
    errors if the file cannot be written or read.
    """
    if not os.path.exists(block_filename):
        print('Unicode block file %s does not exist. Downloading…' % block_filename)
        # Without a timeout a stalled connection would block forever.
        r = requests.get('https://www.unicode.org/Public/UNIDATA/Blocks.txt', timeout=30)
        r.raise_for_status()

        # The response body is already fully downloaded (no stream=True), so
        # write it in one call; iter_content() would default to 1-byte chunks.
        with open(block_filename, 'wb') as f:
            f.write(r.content)

    with open(block_filename, 'rb') as f:
        blocks = load_unicode_blocks_from_file(f)

    return blocks
|
---|
51 |
|
---|
# Module-level block table consulted by get_block_for_codepoint(). It is built
# at import time; load_unicode_blocks() downloads the data file first when
# 'UNIDATA-Blocks.txt' is not found relative to the current working directory.
UNICODE_BLOCKS = load_unicode_blocks('UNIDATA-Blocks.txt')
|
---|