source: other-projects/metadata-encoding/py/using-api/get_unicode_blocks.py@ 38791

Last change on this file since 38791 was 38791, checked in by jc550, 4 months ago

add comments adding context to functions that require it

File size: 1.4 KB
Line 
1#!/usr/bin/env PYTHONIOENCODING=utf-8 python
2# encoding: utf-8
3
4# This is code from Chris Adams, source https://gist.github.com/acdha/49a610089c2798db6fe2
5
6from __future__ import absolute_import, print_function, unicode_literals
7
8import os
9import re
10
11import requests
12
13
def get_block_for_codepoint(cp):
    """Look up the name of the Unicode block containing codepoint ``cp``.

    ``UNICODE_BLOCKS`` is scanned in order and the name of the first
    ``(start, end, name)`` entry whose inclusive range covers ``cp`` is
    returned. ``'No_Block'`` is returned when no entry covers ``cp``.
    """
    matching_names = (
        name for first, last, name in UNICODE_BLOCKS if first <= cp <= last
    )
    # next() stops at the first hit, so later entries are never examined.
    return next(matching_names, 'No_Block')
22
23
def load_unicode_blocks_from_file(f):
    """Parse a Unicode ``Blocks.txt``-style file into a list of block ranges.

    Args:
        f: An open file-like object. ``f.read()`` may return either UTF-8
            encoded bytes (binary mode) or already-decoded text.

    Returns:
        A list of ``(start, end, block_name)`` tuples in file order, where
        ``start`` and ``end`` are the integer values of the hexadecimal
        bounds in each ``START..END; Name`` entry.
    """
    file_contents = f.read()
    # Binary-mode files hand back bytes; text-mode files are already decoded
    # and their result has no ``.decode`` on Python 3.
    if isinstance(file_contents, bytes):
        file_contents = file_contents.decode('utf-8')

    entries = re.findall(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)', file_contents)

    # 'No_Block' is the fallback name get_block_for_codepoint() returns when
    # nothing matches, so it must not be stored as a real range. The pattern
    # is unanchored and therefore also picks up a commented default line such
    # as "# @missing: 0000..10FFFF; No_Block".
    return [
        (int(start, 16), int(end, 16), block_name)
        for start, end, block_name in entries
        if block_name != 'No_Block'
    ]
35
36
def load_unicode_blocks(block_filename):
    """Load Unicode block ranges from ``block_filename``, fetching it if absent.

    When ``block_filename`` does not exist, Blocks.txt is downloaded from
    unicode.org and saved under that name before being parsed.

    Args:
        block_filename: Path of the local copy of Blocks.txt.

    Returns:
        Whatever ``load_unicode_blocks_from_file`` returns for the file
        opened in binary mode.

    Raises:
        requests.exceptions.RequestException: If the download fails, times
            out, or the server answers with an HTTP error status.
    """
    if not os.path.exists(block_filename):
        print('Unicode block file %s does not exist. Downloading' % block_filename)
        # This runs at import time (see the module-level UNICODE_BLOCKS
        # assignment), so a timeout keeps a stalled connection from hanging
        # the import indefinitely.
        r = requests.get('http://unicode.org/Public/UNIDATA/Blocks.txt', timeout=30)
        r.raise_for_status()

        # The request is not streamed, so the whole body is already in
        # memory; write it in one call instead of iter_content()'s default
        # one-byte chunks.
        with open(block_filename, 'wb') as f:
            f.write(r.content)

    with open(block_filename, 'rb') as f:
        return load_unicode_blocks_from_file(f)
51
# Block table consulted by get_block_for_codepoint(). It is built once, when
# this module is imported; the relative filename is resolved against the
# current working directory and the file is downloaded if it is missing.
UNICODE_BLOCKS = load_unicode_blocks(block_filename='UNIDATA-Blocks.txt')
Note: See TracBrowser for help on using the repository browser.