Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/cardinal_pythonlib/text.py : 37%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
2# cardinal_pythonlib/text.py
4"""
5===============================================================================
7 Original code copyright (C) 2009-2021 Rudolf Cardinal (rudolf@pobox.com).
9 This file is part of cardinal_pythonlib.
11 Licensed under the Apache License, Version 2.0 (the "License");
12 you may not use this file except in compliance with the License.
13 You may obtain a copy of the License at
15 https://www.apache.org/licenses/LICENSE-2.0
17 Unless required by applicable law or agreed to in writing, software
18 distributed under the License is distributed on an "AS IS" BASIS,
19 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 See the License for the specific language governing permissions and
21 limitations under the License.
23===============================================================================
25**Simple text-processing functions.**
27"""
29from typing import Dict, List, Union
31from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler
33log = get_brace_style_log_with_null_handler(__name__)
36# =============================================================================
37# Input support methods
38# =============================================================================
def escape_newlines(s: str) -> str:
    """
    Escapes CR, LF, and backslashes.

    Its counterpart is :func:`unescape_newlines`.

    ``s.encode("string_escape")`` and ``s.encode("unicode_escape")`` are
    alternatives, but they mess around with quotes, too (specifically,
    backslash-escaping single quotes).

    Args:
        s: the string to escape

    Returns:
        ``s`` with every backslash doubled and every LF/CR replaced by the
        two-character sequences backslash-n / backslash-r. Falsy input
        (such as the empty string) is returned unchanged.
    """
    if not s:
        return s
    # Backslashes must be handled first; otherwise the backslashes introduced
    # by the newline replacements below would themselves be doubled.
    s = s.replace("\\", r"\\")  # replace \ with \\
    s = s.replace("\n", r"\n")  # escape \n; note ord("\n") == 10
    s = s.replace("\r", r"\r")  # escape \r; note ord("\r") == 13
    return s
def unescape_newlines(s: str) -> str:
    """
    Reverses :func:`escape_newlines`.

    Args:
        s: a string as produced by :func:`escape_newlines`

    Returns:
        the unescaped string. A backslash followed by ``r`` or ``n`` becomes
        CR or LF respectively; a backslash followed by any other character
        yields that character alone (so a doubled backslash becomes a single
        backslash). A lone backslash at the very end of the input is dropped.
        Falsy input (such as the empty string) is returned unchanged.
    """
    # See also https://stackoverflow.com/questions/4020539
    if not s:
        return s
    escapes = {"r": "\r", "n": "\n"}  # escape letter -> control character
    pieces = []  # type: List[str]
    in_escape = False  # True when the previous character was a backslash
    for c in s:
        if in_escape:
            # Unknown escape letters fall through as the letter itself.
            pieces.append(escapes.get(c, c))
            in_escape = False
        elif c == "\\":
            in_escape = True
        else:
            pieces.append(c)
    return "".join(pieces)
def escape_tabs_newlines(s: str) -> str:
    """
    Escapes CR, LF, tab, and backslashes.

    Its counterpart is :func:`unescape_tabs_newlines`.

    Args:
        s: the string to escape

    Returns:
        ``s`` with every backslash doubled and every LF/CR/tab replaced by
        the two-character sequences backslash-n / backslash-r / backslash-t.
        Falsy input (such as the empty string) is returned unchanged.
    """
    if not s:
        return s
    # Backslashes must be handled first; otherwise the backslashes introduced
    # by the replacements below would themselves be doubled.
    s = s.replace("\\", r"\\")  # replace \ with \\
    s = s.replace("\n", r"\n")  # escape \n; note ord("\n") == 10
    s = s.replace("\r", r"\r")  # escape \r; note ord("\r") == 13
    s = s.replace("\t", r"\t")  # escape \t; note ord("\t") == 9
    return s
def unescape_tabs_newlines(s: str) -> str:
    """
    Reverses :func:`escape_tabs_newlines`.

    See also https://stackoverflow.com/questions/4020539.

    Args:
        s: a string as produced by :func:`escape_tabs_newlines`

    Returns:
        the unescaped string. A backslash followed by ``r``, ``n`` or ``t``
        becomes CR, LF or tab respectively; a backslash followed by any other
        character yields that character alone (so a doubled backslash becomes
        a single backslash). A lone backslash at the very end of the input is
        dropped. Falsy input (such as the empty string) is returned
        unchanged.
    """
    if not s:
        return s
    escapes = {"r": "\r", "n": "\n", "t": "\t"}  # escape letter -> character
    pieces = []  # type: List[str]
    in_escape = False  # True when the previous character was a backslash
    for c in s:
        if in_escape:
            # Unknown escape letters fall through as the letter itself.
            pieces.append(escapes.get(c, c))
            in_escape = False
        elif c == "\\":
            in_escape = True
        else:
            pieces.append(c)
    return "".join(pieces)
130# =============================================================================
131# Unicode constants
132# =============================================================================
def _unicode_def_src_to_str(srclist: List[Union[str, int]]) -> str:
    """
    Used to create :data:`UNICODE_CATEGORY_STRINGS`.

    Args:
        srclist: list of integers or hex range strings like ``"0061-007A"``

    Returns:
        a string with all characters described by ``srclist``: either the
        character corresponding to the integer Unicode character number, or
        all characters corresponding to the inclusive range described

    Raises:
        :exc:`ValueError` if a string entry is not of the form
        ``"<hex>-<hex>"``
    """
    charlist = []  # type: List[str]
    for src in srclist:
        if isinstance(src, int):
            # A single code point.
            charlist.append(chr(src))
        else:
            # Range like "0041-005A"; both ends are hexadecimal and inclusive.
            first, last = [int(x, 16) for x in src.split("-")]
            charlist.extend(chr(x) for x in range(first, last + 1))
    return "".join(charlist)
157# https://stackoverflow.com/questions/13233076/determine-if-a-unicode-character-is-alphanumeric-without-using-a-regular-express # noqa
158_UNICODE_CATEGORY_SRC = {
159 # From https://github.com/slevithan/xregexp/blob/master/tools/scripts/property-regex.py # noqa
160 'ASCII': ['0000-007F'],
161 'Alphabetic': ['0041-005A', '0061-007A', 0x00AA, 0x00B5, 0x00BA, '00C0-00D6', '00D8-00F6', '00F8-02C1', '02C6-02D1', '02E0-02E4', 0x02EC, 0x02EE, 0x0345, '0370-0374', 0x0376, 0x0377, '037A-037D', 0x037F, 0x0386, '0388-038A', 0x038C, '038E-03A1', '03A3-03F5', '03F7-0481', '048A-052F', '0531-0556', 0x0559, '0561-0587', '05B0-05BD', 0x05BF, 0x05C1, 0x05C2, 0x05C4, 0x05C5, 0x05C7, '05D0-05EA', '05F0-05F2', '0610-061A', '0620-0657', '0659-065F', '066E-06D3', '06D5-06DC', '06E1-06E8', '06ED-06EF', '06FA-06FC', 0x06FF, '0710-073F', '074D-07B1', '07CA-07EA', 0x07F4, 0x07F5, 0x07FA, '0800-0817', '081A-082C', '0840-0858', '08A0-08B4', '08B6-08BD', '08D4-08DF', '08E3-08E9', '08F0-093B', '093D-094C', '094E-0950', '0955-0963', '0971-0983', '0985-098C', 0x098F, 0x0990, '0993-09A8', '09AA-09B0', 0x09B2, '09B6-09B9', '09BD-09C4', 0x09C7, 0x09C8, 0x09CB, 0x09CC, 0x09CE, 0x09D7, 0x09DC, 0x09DD, '09DF-09E3', 0x09F0, 0x09F1, '0A01-0A03', '0A05-0A0A', 0x0A0F, 0x0A10, '0A13-0A28', '0A2A-0A30', 0x0A32, 0x0A33, 0x0A35, 0x0A36, 0x0A38, 0x0A39, '0A3E-0A42', 0x0A47, 0x0A48, 0x0A4B, 0x0A4C, 0x0A51, '0A59-0A5C', 0x0A5E, '0A70-0A75', '0A81-0A83', '0A85-0A8D', '0A8F-0A91', '0A93-0AA8', '0AAA-0AB0', 0x0AB2, 0x0AB3, '0AB5-0AB9', '0ABD-0AC5', '0AC7-0AC9', 0x0ACB, 0x0ACC, 0x0AD0, '0AE0-0AE3', 0x0AF9, '0B01-0B03', '0B05-0B0C', 0x0B0F, 0x0B10, '0B13-0B28', '0B2A-0B30', 0x0B32, 0x0B33, '0B35-0B39', '0B3D-0B44', 0x0B47, 0x0B48, 0x0B4B, 0x0B4C, 0x0B56, 0x0B57, 0x0B5C, 0x0B5D, '0B5F-0B63', 0x0B71, 0x0B82, 0x0B83, '0B85-0B8A', '0B8E-0B90', '0B92-0B95', 0x0B99, 0x0B9A, 0x0B9C, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, '0BA8-0BAA', '0BAE-0BB9', '0BBE-0BC2', '0BC6-0BC8', '0BCA-0BCC', 0x0BD0, 0x0BD7, '0C00-0C03', '0C05-0C0C', '0C0E-0C10', '0C12-0C28', '0C2A-0C39', '0C3D-0C44', '0C46-0C48', '0C4A-0C4C', 0x0C55, 0x0C56, '0C58-0C5A', '0C60-0C63', '0C80-0C83', '0C85-0C8C', '0C8E-0C90', '0C92-0CA8', '0CAA-0CB3', '0CB5-0CB9', '0CBD-0CC4', '0CC6-0CC8', '0CCA-0CCC', 0x0CD5, 0x0CD6, 0x0CDE, '0CE0-0CE3', 0x0CF1, 0x0CF2, 
'0D01-0D03', '0D05-0D0C', '0D0E-0D10', '0D12-0D3A', '0D3D-0D44', '0D46-0D48', '0D4A-0D4C', 0x0D4E, '0D54-0D57', '0D5F-0D63', '0D7A-0D7F', 0x0D82, 0x0D83, '0D85-0D96', '0D9A-0DB1', '0DB3-0DBB', 0x0DBD, '0DC0-0DC6', '0DCF-0DD4', 0x0DD6, '0DD8-0DDF', 0x0DF2, 0x0DF3, '0E01-0E3A', '0E40-0E46', 0x0E4D, 0x0E81, 0x0E82, 0x0E84, 0x0E87, 0x0E88, 0x0E8A, 0x0E8D, '0E94-0E97', '0E99-0E9F', '0EA1-0EA3', 0x0EA5, 0x0EA7, 0x0EAA, 0x0EAB, '0EAD-0EB9', '0EBB-0EBD', '0EC0-0EC4', 0x0EC6, 0x0ECD, '0EDC-0EDF', 0x0F00, '0F40-0F47', '0F49-0F6C', '0F71-0F81', '0F88-0F97', '0F99-0FBC', '1000-1036', 0x1038, '103B-103F', '1050-1062', '1065-1068', '106E-1086', 0x108E, 0x109C, 0x109D, '10A0-10C5', 0x10C7, 0x10CD, '10D0-10FA', '10FC-1248', '124A-124D', '1250-1256', 0x1258, '125A-125D', '1260-1288', '128A-128D', '1290-12B0', '12B2-12B5', '12B8-12BE', 0x12C0, '12C2-12C5', '12C8-12D6', '12D8-1310', '1312-1315', '1318-135A', 0x135F, '1380-138F', '13A0-13F5', '13F8-13FD', '1401-166C', '166F-167F', '1681-169A', '16A0-16EA', '16EE-16F8', '1700-170C', '170E-1713', '1720-1733', '1740-1753', '1760-176C', '176E-1770', 0x1772, 0x1773, '1780-17B3', '17B6-17C8', 0x17D7, 0x17DC, '1820-1877', '1880-18AA', '18B0-18F5', '1900-191E', '1920-192B', '1930-1938', '1950-196D', '1970-1974', '1980-19AB', '19B0-19C9', '1A00-1A1B', '1A20-1A5E', '1A61-1A74', 0x1AA7, '1B00-1B33', '1B35-1B43', '1B45-1B4B', '1B80-1BA9', '1BAC-1BAF', '1BBA-1BE5', '1BE7-1BF1', '1C00-1C35', '1C4D-1C4F', '1C5A-1C7D', '1C80-1C88', '1CE9-1CEC', '1CEE-1CF3', 0x1CF5, 0x1CF6, '1D00-1DBF', '1DE7-1DF4', '1E00-1F15', '1F18-1F1D', '1F20-1F45', '1F48-1F4D', '1F50-1F57', 0x1F59, 0x1F5B, 0x1F5D, '1F5F-1F7D', '1F80-1FB4', '1FB6-1FBC', 0x1FBE, '1FC2-1FC4', '1FC6-1FCC', '1FD0-1FD3', '1FD6-1FDB', '1FE0-1FEC', '1FF2-1FF4', '1FF6-1FFC', 0x2071, 0x207F, '2090-209C', 0x2102, 0x2107, '210A-2113', 0x2115, '2119-211D', 0x2124, 0x2126, 0x2128, '212A-212D', '212F-2139', '213C-213F', '2145-2149', 0x214E, '2160-2188', '24B6-24E9', '2C00-2C2E', '2C30-2C5E', '2C60-2CE4', 
'2CEB-2CEE', 0x2CF2, 0x2CF3, '2D00-2D25', 0x2D27, 0x2D2D, '2D30-2D67', 0x2D6F, '2D80-2D96', '2DA0-2DA6', '2DA8-2DAE', '2DB0-2DB6', '2DB8-2DBE', '2DC0-2DC6', '2DC8-2DCE', '2DD0-2DD6', '2DD8-2DDE', '2DE0-2DFF', 0x2E2F, '3005-3007', '3021-3029', '3031-3035', '3038-303C', '3041-3096', '309D-309F', '30A1-30FA', '30FC-30FF', '3105-312D', '3131-318E', '31A0-31BA', '31F0-31FF', '3400-4DB5', '4E00-9FD5', 'A000-A48C', 'A4D0-A4FD', 'A500-A60C', 'A610-A61F', 0xA62A, 0xA62B, 'A640-A66E', 'A674-A67B', 'A67F-A6EF', 'A717-A71F', 'A722-A788', 'A78B-A7AE', 'A7B0-A7B7', 'A7F7-A801', 'A803-A805', 'A807-A80A', 'A80C-A827', 'A840-A873', 'A880-A8C3', 0xA8C5, 'A8F2-A8F7', 0xA8FB, 0xA8FD, 'A90A-A92A', 'A930-A952', 'A960-A97C', 'A980-A9B2', 'A9B4-A9BF', 0xA9CF, 'A9E0-A9E4', 'A9E6-A9EF', 'A9FA-A9FE', 'AA00-AA36', 'AA40-AA4D', 'AA60-AA76', 0xAA7A, 'AA7E-AABE', 0xAAC0, 0xAAC2, 'AADB-AADD', 'AAE0-AAEF', 'AAF2-AAF5', 'AB01-AB06', 'AB09-AB0E', 'AB11-AB16', 'AB20-AB26', 'AB28-AB2E', 'AB30-AB5A', 'AB5C-AB65', 'AB70-ABEA', 'AC00-D7A3', 'D7B0-D7C6', 'D7CB-D7FB', 'F900-FA6D', 'FA70-FAD9', 'FB00-FB06', 'FB13-FB17', 'FB1D-FB28', 'FB2A-FB36', 'FB38-FB3C', 0xFB3E, 0xFB40, 0xFB41, 0xFB43, 0xFB44, 'FB46-FBB1', 'FBD3-FD3D', 'FD50-FD8F', 'FD92-FDC7', 'FDF0-FDFB', 'FE70-FE74', 'FE76-FEFC', 'FF21-FF3A', 'FF41-FF5A', 'FF66-FFBE', 'FFC2-FFC7', 'FFCA-FFCF', 'FFD2-FFD7', 'FFDA-FFDC', '10000-1000B', '1000D-10026', '10028-1003A', 0x1003C, 0x1003D, '1003F-1004D', '10050-1005D', '10080-100FA', '10140-10174', '10280-1029C', '102A0-102D0', '10300-1031F', '10330-1034A', '10350-1037A', '10380-1039D', '103A0-103C3', '103C8-103CF', '103D1-103D5', '10400-1049D', '104B0-104D3', '104D8-104FB', '10500-10527', '10530-10563', '10600-10736', '10740-10755', '10760-10767', '10800-10805', 0x10808, '1080A-10835', 0x10837, 0x10838, 0x1083C, '1083F-10855', '10860-10876', '10880-1089E', '108E0-108F2', 0x108F4, 0x108F5, '10900-10915', '10920-10939', '10980-109B7', 0x109BE, 0x109BF, '10A00-10A03', 0x10A05, 0x10A06, '10A0C-10A13', 
'10A15-10A17', '10A19-10A33', '10A60-10A7C', '10A80-10A9C', '10AC0-10AC7', '10AC9-10AE4', '10B00-10B35', '10B40-10B55', '10B60-10B72', '10B80-10B91', '10C00-10C48', '10C80-10CB2', '10CC0-10CF2', '11000-11045', '11082-110B8', '110D0-110E8', '11100-11132', '11150-11172', 0x11176, '11180-111BF', '111C1-111C4', 0x111DA, 0x111DC, '11200-11211', '11213-11234', 0x11237, 0x1123E, '11280-11286', 0x11288, '1128A-1128D', '1128F-1129D', '1129F-112A8', '112B0-112E8', '11300-11303', '11305-1130C', 0x1130F, 0x11310, '11313-11328', '1132A-11330', 0x11332, 0x11333, '11335-11339', '1133D-11344', 0x11347, 0x11348, 0x1134B, 0x1134C, 0x11350, 0x11357, '1135D-11363', '11400-11441', '11443-11445', '11447-1144A', '11480-114C1', 0x114C4, 0x114C5, 0x114C7, '11580-115B5', '115B8-115BE', '115D8-115DD', '11600-1163E', 0x11640, 0x11644, '11680-116B5', '11700-11719', '1171D-1172A', '118A0-118DF', 0x118FF, '11AC0-11AF8', '11C00-11C08', '11C0A-11C36', '11C38-11C3E', 0x11C40, '11C72-11C8F', '11C92-11CA7', '11CA9-11CB6', '12000-12399', '12400-1246E', '12480-12543', '13000-1342E', '14400-14646', '16800-16A38', '16A40-16A5E', '16AD0-16AED', '16B00-16B36', '16B40-16B43', '16B63-16B77', '16B7D-16B8F', '16F00-16F44', '16F50-16F7E', '16F93-16F9F', 0x16FE0, '17000-187EC', '18800-18AF2', 0x1B000, 0x1B001, '1BC00-1BC6A', '1BC70-1BC7C', '1BC80-1BC88', '1BC90-1BC99', 0x1BC9E, '1D400-1D454', '1D456-1D49C', 0x1D49E, 0x1D49F, 0x1D4A2, 0x1D4A5, 0x1D4A6, '1D4A9-1D4AC', '1D4AE-1D4B9', 0x1D4BB, '1D4BD-1D4C3', '1D4C5-1D505', '1D507-1D50A', '1D50D-1D514', '1D516-1D51C', '1D51E-1D539', '1D53B-1D53E', '1D540-1D544', 0x1D546, '1D54A-1D550', '1D552-1D6A5', '1D6A8-1D6C0', '1D6C2-1D6DA', '1D6DC-1D6FA', '1D6FC-1D714', '1D716-1D734', '1D736-1D74E', '1D750-1D76E', '1D770-1D788', '1D78A-1D7A8', '1D7AA-1D7C2', '1D7C4-1D7CB', '1E000-1E006', '1E008-1E018', '1E01B-1E021', 0x1E023, 0x1E024, '1E026-1E02A', '1E800-1E8C4', '1E900-1E943', 0x1E947, '1EE00-1EE03', '1EE05-1EE1F', 0x1EE21, 0x1EE22, 0x1EE24, 0x1EE27, '1EE29-1EE32', 
'1EE34-1EE37', 0x1EE39, 0x1EE3B, 0x1EE42, 0x1EE47, 0x1EE49, 0x1EE4B, '1EE4D-1EE4F', 0x1EE51, 0x1EE52, 0x1EE54, 0x1EE57, 0x1EE59, 0x1EE5B, 0x1EE5D, 0x1EE5F, 0x1EE61, 0x1EE62, 0x1EE64, '1EE67-1EE6A', '1EE6C-1EE72', '1EE74-1EE77', '1EE79-1EE7C', 0x1EE7E, '1EE80-1EE89', '1EE8B-1EE9B', '1EEA1-1EEA3', '1EEA5-1EEA9', '1EEAB-1EEBB', '1F130-1F149', '1F150-1F169', '1F170-1F189', '20000-2A6D6', '2A700-2B734', '2B740-2B81D', '2B820-2CEA1', '2F800-2FA1D'], # noqa
162 'Any': ['0000-10FFFF'],
163 # 'Assigned': [], # Defined as the inverse of category Cn
164 'Default_Ignorable_Code_Point': [0x00AD, 0x034F, 0x061C, 0x115F, 0x1160, 0x17B4, 0x17B5, '180B-180E', '200B-200F', '202A-202E', '2060-206F', 0x3164, 'FE00-FE0F', 0xFEFF, 0xFFA0, 'FFF0-FFF8', '1BCA0-1BCA3', '1D173-1D17A', 'E0000-E0FFF'], # noqa
165 'Lowercase': ['0061-007A', 0x00AA, 0x00B5, 0x00BA, '00DF-00F6', '00F8-00FF', 0x0101, 0x0103, 0x0105, 0x0107, 0x0109, 0x010B, 0x010D, 0x010F, 0x0111, 0x0113, 0x0115, 0x0117, 0x0119, 0x011B, 0x011D, 0x011F, 0x0121, 0x0123, 0x0125, 0x0127, 0x0129, 0x012B, 0x012D, 0x012F, 0x0131, 0x0133, 0x0135, 0x0137, 0x0138, 0x013A, 0x013C, 0x013E, 0x0140, 0x0142, 0x0144, 0x0146, 0x0148, 0x0149, 0x014B, 0x014D, 0x014F, 0x0151, 0x0153, 0x0155, 0x0157, 0x0159, 0x015B, 0x015D, 0x015F, 0x0161, 0x0163, 0x0165, 0x0167, 0x0169, 0x016B, 0x016D, 0x016F, 0x0171, 0x0173, 0x0175, 0x0177, 0x017A, 0x017C, '017E-0180', 0x0183, 0x0185, 0x0188, 0x018C, 0x018D, 0x0192, 0x0195, '0199-019B', 0x019E, 0x01A1, 0x01A3, 0x01A5, 0x01A8, 0x01AA, 0x01AB, 0x01AD, 0x01B0, 0x01B4, 0x01B6, 0x01B9, 0x01BA, '01BD-01BF', 0x01C6, 0x01C9, 0x01CC, 0x01CE, 0x01D0, 0x01D2, 0x01D4, 0x01D6, 0x01D8, 0x01DA, 0x01DC, 0x01DD, 0x01DF, 0x01E1, 0x01E3, 0x01E5, 0x01E7, 0x01E9, 0x01EB, 0x01ED, 0x01EF, 0x01F0, 0x01F3, 0x01F5, 0x01F9, 0x01FB, 0x01FD, 0x01FF, 0x0201, 0x0203, 0x0205, 0x0207, 0x0209, 0x020B, 0x020D, 0x020F, 0x0211, 0x0213, 0x0215, 0x0217, 0x0219, 0x021B, 0x021D, 0x021F, 0x0221, 0x0223, 0x0225, 0x0227, 0x0229, 0x022B, 0x022D, 0x022F, 0x0231, '0233-0239', 0x023C, 0x023F, 0x0240, 0x0242, 0x0247, 0x0249, 0x024B, 0x024D, '024F-0293', '0295-02B8', 0x02C0, 0x02C1, '02E0-02E4', 0x0345, 0x0371, 0x0373, 0x0377, '037A-037D', 0x0390, '03AC-03CE', 0x03D0, 0x03D1, '03D5-03D7', 0x03D9, 0x03DB, 0x03DD, 0x03DF, 0x03E1, 0x03E3, 0x03E5, 0x03E7, 0x03E9, 0x03EB, 0x03ED, '03EF-03F3', 0x03F5, 0x03F8, 0x03FB, 0x03FC, '0430-045F', 0x0461, 0x0463, 0x0465, 0x0467, 0x0469, 0x046B, 0x046D, 0x046F, 0x0471, 0x0473, 0x0475, 0x0477, 0x0479, 0x047B, 0x047D, 0x047F, 0x0481, 0x048B, 0x048D, 0x048F, 0x0491, 0x0493, 0x0495, 0x0497, 0x0499, 0x049B, 0x049D, 0x049F, 0x04A1, 0x04A3, 0x04A5, 0x04A7, 0x04A9, 0x04AB, 0x04AD, 0x04AF, 0x04B1, 0x04B3, 0x04B5, 0x04B7, 0x04B9, 0x04BB, 0x04BD, 0x04BF, 0x04C2, 0x04C4, 0x04C6, 0x04C8, 0x04CA, 0x04CC, 0x04CE, 0x04CF, 
0x04D1, 0x04D3, 0x04D5, 0x04D7, 0x04D9, 0x04DB, 0x04DD, 0x04DF, 0x04E1, 0x04E3, 0x04E5, 0x04E7, 0x04E9, 0x04EB, 0x04ED, 0x04EF, 0x04F1, 0x04F3, 0x04F5, 0x04F7, 0x04F9, 0x04FB, 0x04FD, 0x04FF, 0x0501, 0x0503, 0x0505, 0x0507, 0x0509, 0x050B, 0x050D, 0x050F, 0x0511, 0x0513, 0x0515, 0x0517, 0x0519, 0x051B, 0x051D, 0x051F, 0x0521, 0x0523, 0x0525, 0x0527, 0x0529, 0x052B, 0x052D, 0x052F, '0561-0587', '13F8-13FD', '1C80-1C88', '1D00-1DBF', 0x1E01, 0x1E03, 0x1E05, 0x1E07, 0x1E09, 0x1E0B, 0x1E0D, 0x1E0F, 0x1E11, 0x1E13, 0x1E15, 0x1E17, 0x1E19, 0x1E1B, 0x1E1D, 0x1E1F, 0x1E21, 0x1E23, 0x1E25, 0x1E27, 0x1E29, 0x1E2B, 0x1E2D, 0x1E2F, 0x1E31, 0x1E33, 0x1E35, 0x1E37, 0x1E39, 0x1E3B, 0x1E3D, 0x1E3F, 0x1E41, 0x1E43, 0x1E45, 0x1E47, 0x1E49, 0x1E4B, 0x1E4D, 0x1E4F, 0x1E51, 0x1E53, 0x1E55, 0x1E57, 0x1E59, 0x1E5B, 0x1E5D, 0x1E5F, 0x1E61, 0x1E63, 0x1E65, 0x1E67, 0x1E69, 0x1E6B, 0x1E6D, 0x1E6F, 0x1E71, 0x1E73, 0x1E75, 0x1E77, 0x1E79, 0x1E7B, 0x1E7D, 0x1E7F, 0x1E81, 0x1E83, 0x1E85, 0x1E87, 0x1E89, 0x1E8B, 0x1E8D, 0x1E8F, 0x1E91, 0x1E93, '1E95-1E9D', 0x1E9F, 0x1EA1, 0x1EA3, 0x1EA5, 0x1EA7, 0x1EA9, 0x1EAB, 0x1EAD, 0x1EAF, 0x1EB1, 0x1EB3, 0x1EB5, 0x1EB7, 0x1EB9, 0x1EBB, 0x1EBD, 0x1EBF, 0x1EC1, 0x1EC3, 0x1EC5, 0x1EC7, 0x1EC9, 0x1ECB, 0x1ECD, 0x1ECF, 0x1ED1, 0x1ED3, 0x1ED5, 0x1ED7, 0x1ED9, 0x1EDB, 0x1EDD, 0x1EDF, 0x1EE1, 0x1EE3, 0x1EE5, 0x1EE7, 0x1EE9, 0x1EEB, 0x1EED, 0x1EEF, 0x1EF1, 0x1EF3, 0x1EF5, 0x1EF7, 0x1EF9, 0x1EFB, 0x1EFD, '1EFF-1F07', '1F10-1F15', '1F20-1F27', '1F30-1F37', '1F40-1F45', '1F50-1F57', '1F60-1F67', '1F70-1F7D', '1F80-1F87', '1F90-1F97', '1FA0-1FA7', '1FB0-1FB4', 0x1FB6, 0x1FB7, 0x1FBE, '1FC2-1FC4', 0x1FC6, 0x1FC7, '1FD0-1FD3', 0x1FD6, 0x1FD7, '1FE0-1FE7', '1FF2-1FF4', 0x1FF6, 0x1FF7, 0x2071, 0x207F, '2090-209C', 0x210A, 0x210E, 0x210F, 0x2113, 0x212F, 0x2134, 0x2139, 0x213C, 0x213D, '2146-2149', 0x214E, '2170-217F', 0x2184, '24D0-24E9', '2C30-2C5E', 0x2C61, 0x2C65, 0x2C66, 0x2C68, 0x2C6A, 0x2C6C, 0x2C71, 0x2C73, 0x2C74, '2C76-2C7D', 0x2C81, 0x2C83, 0x2C85, 0x2C87, 0x2C89, 
0x2C8B, 0x2C8D, 0x2C8F, 0x2C91, 0x2C93, 0x2C95, 0x2C97, 0x2C99, 0x2C9B, 0x2C9D, 0x2C9F, 0x2CA1, 0x2CA3, 0x2CA5, 0x2CA7, 0x2CA9, 0x2CAB, 0x2CAD, 0x2CAF, 0x2CB1, 0x2CB3, 0x2CB5, 0x2CB7, 0x2CB9, 0x2CBB, 0x2CBD, 0x2CBF, 0x2CC1, 0x2CC3, 0x2CC5, 0x2CC7, 0x2CC9, 0x2CCB, 0x2CCD, 0x2CCF, 0x2CD1, 0x2CD3, 0x2CD5, 0x2CD7, 0x2CD9, 0x2CDB, 0x2CDD, 0x2CDF, 0x2CE1, 0x2CE3, 0x2CE4, 0x2CEC, 0x2CEE, 0x2CF3, '2D00-2D25', 0x2D27, 0x2D2D, 0xA641, 0xA643, 0xA645, 0xA647, 0xA649, 0xA64B, 0xA64D, 0xA64F, 0xA651, 0xA653, 0xA655, 0xA657, 0xA659, 0xA65B, 0xA65D, 0xA65F, 0xA661, 0xA663, 0xA665, 0xA667, 0xA669, 0xA66B, 0xA66D, 0xA681, 0xA683, 0xA685, 0xA687, 0xA689, 0xA68B, 0xA68D, 0xA68F, 0xA691, 0xA693, 0xA695, 0xA697, 0xA699, 'A69B-A69D', 0xA723, 0xA725, 0xA727, 0xA729, 0xA72B, 0xA72D, 'A72F-A731', 0xA733, 0xA735, 0xA737, 0xA739, 0xA73B, 0xA73D, 0xA73F, 0xA741, 0xA743, 0xA745, 0xA747, 0xA749, 0xA74B, 0xA74D, 0xA74F, 0xA751, 0xA753, 0xA755, 0xA757, 0xA759, 0xA75B, 0xA75D, 0xA75F, 0xA761, 0xA763, 0xA765, 0xA767, 0xA769, 0xA76B, 0xA76D, 'A76F-A778', 0xA77A, 0xA77C, 0xA77F, 0xA781, 0xA783, 0xA785, 0xA787, 0xA78C, 0xA78E, 0xA791, 'A793-A795', 0xA797, 0xA799, 0xA79B, 0xA79D, 0xA79F, 0xA7A1, 0xA7A3, 0xA7A5, 0xA7A7, 0xA7A9, 0xA7B5, 0xA7B7, 'A7F8-A7FA', 'AB30-AB5A', 'AB5C-AB65', 'AB70-ABBF', 'FB00-FB06', 'FB13-FB17', 'FF41-FF5A', '10428-1044F', '104D8-104FB', '10CC0-10CF2', '118C0-118DF', '1D41A-1D433', '1D44E-1D454', '1D456-1D467', '1D482-1D49B', '1D4B6-1D4B9', 0x1D4BB, '1D4BD-1D4C3', '1D4C5-1D4CF', '1D4EA-1D503', '1D51E-1D537', '1D552-1D56B', '1D586-1D59F', '1D5BA-1D5D3', '1D5EE-1D607', '1D622-1D63B', '1D656-1D66F', '1D68A-1D6A5', '1D6C2-1D6DA', '1D6DC-1D6E1', '1D6FC-1D714', '1D716-1D71B', '1D736-1D74E', '1D750-1D755', '1D770-1D788', '1D78A-1D78F', '1D7AA-1D7C2', '1D7C4-1D7C9', 0x1D7CB, '1E922-1E943'], # noqa
166 'Noncharacter_Code_Point': ['FDD0-FDEF', 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF], # noqa
167 'Uppercase': ['0041-005A', '00C0-00D6', '00D8-00DE', 0x0100, 0x0102, 0x0104, 0x0106, 0x0108, 0x010A, 0x010C, 0x010E, 0x0110, 0x0112, 0x0114, 0x0116, 0x0118, 0x011A, 0x011C, 0x011E, 0x0120, 0x0122, 0x0124, 0x0126, 0x0128, 0x012A, 0x012C, 0x012E, 0x0130, 0x0132, 0x0134, 0x0136, 0x0139, 0x013B, 0x013D, 0x013F, 0x0141, 0x0143, 0x0145, 0x0147, 0x014A, 0x014C, 0x014E, 0x0150, 0x0152, 0x0154, 0x0156, 0x0158, 0x015A, 0x015C, 0x015E, 0x0160, 0x0162, 0x0164, 0x0166, 0x0168, 0x016A, 0x016C, 0x016E, 0x0170, 0x0172, 0x0174, 0x0176, 0x0178, 0x0179, 0x017B, 0x017D, 0x0181, 0x0182, 0x0184, 0x0186, 0x0187, '0189-018B', '018E-0191', 0x0193, 0x0194, '0196-0198', 0x019C, 0x019D, 0x019F, 0x01A0, 0x01A2, 0x01A4, 0x01A6, 0x01A7, 0x01A9, 0x01AC, 0x01AE, 0x01AF, '01B1-01B3', 0x01B5, 0x01B7, 0x01B8, 0x01BC, 0x01C4, 0x01C7, 0x01CA, 0x01CD, 0x01CF, 0x01D1, 0x01D3, 0x01D5, 0x01D7, 0x01D9, 0x01DB, 0x01DE, 0x01E0, 0x01E2, 0x01E4, 0x01E6, 0x01E8, 0x01EA, 0x01EC, 0x01EE, 0x01F1, 0x01F4, '01F6-01F8', 0x01FA, 0x01FC, 0x01FE, 0x0200, 0x0202, 0x0204, 0x0206, 0x0208, 0x020A, 0x020C, 0x020E, 0x0210, 0x0212, 0x0214, 0x0216, 0x0218, 0x021A, 0x021C, 0x021E, 0x0220, 0x0222, 0x0224, 0x0226, 0x0228, 0x022A, 0x022C, 0x022E, 0x0230, 0x0232, 0x023A, 0x023B, 0x023D, 0x023E, 0x0241, '0243-0246', 0x0248, 0x024A, 0x024C, 0x024E, 0x0370, 0x0372, 0x0376, 0x037F, 0x0386, '0388-038A', 0x038C, 0x038E, 0x038F, '0391-03A1', '03A3-03AB', 0x03CF, '03D2-03D4', 0x03D8, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x03E2, 0x03E4, 0x03E6, 0x03E8, 0x03EA, 0x03EC, 0x03EE, 0x03F4, 0x03F7, 0x03F9, 0x03FA, '03FD-042F', 0x0460, 0x0462, 0x0464, 0x0466, 0x0468, 0x046A, 0x046C, 0x046E, 0x0470, 0x0472, 0x0474, 0x0476, 0x0478, 0x047A, 0x047C, 0x047E, 0x0480, 0x048A, 0x048C, 0x048E, 0x0490, 0x0492, 0x0494, 0x0496, 0x0498, 0x049A, 0x049C, 0x049E, 0x04A0, 0x04A2, 0x04A4, 0x04A6, 0x04A8, 0x04AA, 0x04AC, 0x04AE, 0x04B0, 0x04B2, 0x04B4, 0x04B6, 0x04B8, 0x04BA, 0x04BC, 0x04BE, 0x04C0, 0x04C1, 0x04C3, 0x04C5, 0x04C7, 0x04C9, 0x04CB, 0x04CD, 0x04D0, 0x04D2, 
0x04D4, 0x04D6, 0x04D8, 0x04DA, 0x04DC, 0x04DE, 0x04E0, 0x04E2, 0x04E4, 0x04E6, 0x04E8, 0x04EA, 0x04EC, 0x04EE, 0x04F0, 0x04F2, 0x04F4, 0x04F6, 0x04F8, 0x04FA, 0x04FC, 0x04FE, 0x0500, 0x0502, 0x0504, 0x0506, 0x0508, 0x050A, 0x050C, 0x050E, 0x0510, 0x0512, 0x0514, 0x0516, 0x0518, 0x051A, 0x051C, 0x051E, 0x0520, 0x0522, 0x0524, 0x0526, 0x0528, 0x052A, 0x052C, 0x052E, '0531-0556', '10A0-10C5', 0x10C7, 0x10CD, '13A0-13F5', 0x1E00, 0x1E02, 0x1E04, 0x1E06, 0x1E08, 0x1E0A, 0x1E0C, 0x1E0E, 0x1E10, 0x1E12, 0x1E14, 0x1E16, 0x1E18, 0x1E1A, 0x1E1C, 0x1E1E, 0x1E20, 0x1E22, 0x1E24, 0x1E26, 0x1E28, 0x1E2A, 0x1E2C, 0x1E2E, 0x1E30, 0x1E32, 0x1E34, 0x1E36, 0x1E38, 0x1E3A, 0x1E3C, 0x1E3E, 0x1E40, 0x1E42, 0x1E44, 0x1E46, 0x1E48, 0x1E4A, 0x1E4C, 0x1E4E, 0x1E50, 0x1E52, 0x1E54, 0x1E56, 0x1E58, 0x1E5A, 0x1E5C, 0x1E5E, 0x1E60, 0x1E62, 0x1E64, 0x1E66, 0x1E68, 0x1E6A, 0x1E6C, 0x1E6E, 0x1E70, 0x1E72, 0x1E74, 0x1E76, 0x1E78, 0x1E7A, 0x1E7C, 0x1E7E, 0x1E80, 0x1E82, 0x1E84, 0x1E86, 0x1E88, 0x1E8A, 0x1E8C, 0x1E8E, 0x1E90, 0x1E92, 0x1E94, 0x1E9E, 0x1EA0, 0x1EA2, 0x1EA4, 0x1EA6, 0x1EA8, 0x1EAA, 0x1EAC, 0x1EAE, 0x1EB0, 0x1EB2, 0x1EB4, 0x1EB6, 0x1EB8, 0x1EBA, 0x1EBC, 0x1EBE, 0x1EC0, 0x1EC2, 0x1EC4, 0x1EC6, 0x1EC8, 0x1ECA, 0x1ECC, 0x1ECE, 0x1ED0, 0x1ED2, 0x1ED4, 0x1ED6, 0x1ED8, 0x1EDA, 0x1EDC, 0x1EDE, 0x1EE0, 0x1EE2, 0x1EE4, 0x1EE6, 0x1EE8, 0x1EEA, 0x1EEC, 0x1EEE, 0x1EF0, 0x1EF2, 0x1EF4, 0x1EF6, 0x1EF8, 0x1EFA, 0x1EFC, 0x1EFE, '1F08-1F0F', '1F18-1F1D', '1F28-1F2F', '1F38-1F3F', '1F48-1F4D', 0x1F59, 0x1F5B, 0x1F5D, 0x1F5F, '1F68-1F6F', '1FB8-1FBB', '1FC8-1FCB', '1FD8-1FDB', '1FE8-1FEC', '1FF8-1FFB', 0x2102, 0x2107, '210B-210D', '2110-2112', 0x2115, '2119-211D', 0x2124, 0x2126, 0x2128, '212A-212D', '2130-2133', 0x213E, 0x213F, 0x2145, '2160-216F', 0x2183, '24B6-24CF', '2C00-2C2E', 0x2C60, '2C62-2C64', 0x2C67, 0x2C69, 0x2C6B, '2C6D-2C70', 0x2C72, 0x2C75, '2C7E-2C80', 0x2C82, 0x2C84, 0x2C86, 0x2C88, 0x2C8A, 0x2C8C, 0x2C8E, 0x2C90, 0x2C92, 0x2C94, 0x2C96, 0x2C98, 0x2C9A, 0x2C9C, 0x2C9E, 0x2CA0, 0x2CA2, 
0x2CA4, 0x2CA6, 0x2CA8, 0x2CAA, 0x2CAC, 0x2CAE, 0x2CB0, 0x2CB2, 0x2CB4, 0x2CB6, 0x2CB8, 0x2CBA, 0x2CBC, 0x2CBE, 0x2CC0, 0x2CC2, 0x2CC4, 0x2CC6, 0x2CC8, 0x2CCA, 0x2CCC, 0x2CCE, 0x2CD0, 0x2CD2, 0x2CD4, 0x2CD6, 0x2CD8, 0x2CDA, 0x2CDC, 0x2CDE, 0x2CE0, 0x2CE2, 0x2CEB, 0x2CED, 0x2CF2, 0xA640, 0xA642, 0xA644, 0xA646, 0xA648, 0xA64A, 0xA64C, 0xA64E, 0xA650, 0xA652, 0xA654, 0xA656, 0xA658, 0xA65A, 0xA65C, 0xA65E, 0xA660, 0xA662, 0xA664, 0xA666, 0xA668, 0xA66A, 0xA66C, 0xA680, 0xA682, 0xA684, 0xA686, 0xA688, 0xA68A, 0xA68C, 0xA68E, 0xA690, 0xA692, 0xA694, 0xA696, 0xA698, 0xA69A, 0xA722, 0xA724, 0xA726, 0xA728, 0xA72A, 0xA72C, 0xA72E, 0xA732, 0xA734, 0xA736, 0xA738, 0xA73A, 0xA73C, 0xA73E, 0xA740, 0xA742, 0xA744, 0xA746, 0xA748, 0xA74A, 0xA74C, 0xA74E, 0xA750, 0xA752, 0xA754, 0xA756, 0xA758, 0xA75A, 0xA75C, 0xA75E, 0xA760, 0xA762, 0xA764, 0xA766, 0xA768, 0xA76A, 0xA76C, 0xA76E, 0xA779, 0xA77B, 0xA77D, 0xA77E, 0xA780, 0xA782, 0xA784, 0xA786, 0xA78B, 0xA78D, 0xA790, 0xA792, 0xA796, 0xA798, 0xA79A, 0xA79C, 0xA79E, 0xA7A0, 0xA7A2, 0xA7A4, 0xA7A6, 0xA7A8, 'A7AA-A7AE', 'A7B0-A7B4', 0xA7B6, 'FF21-FF3A', '10400-10427', '104B0-104D3', '10C80-10CB2', '118A0-118BF', '1D400-1D419', '1D434-1D44D', '1D468-1D481', 0x1D49C, 0x1D49E, 0x1D49F, 0x1D4A2, 0x1D4A5, 0x1D4A6, '1D4A9-1D4AC', '1D4AE-1D4B5', '1D4D0-1D4E9', 0x1D504, 0x1D505, '1D507-1D50A', '1D50D-1D514', '1D516-1D51C', 0x1D538, 0x1D539, '1D53B-1D53E', '1D540-1D544', 0x1D546, '1D54A-1D550', '1D56C-1D585', '1D5A0-1D5B9', '1D5D4-1D5ED', '1D608-1D621', '1D63C-1D655', '1D670-1D689', '1D6A8-1D6C0', '1D6E2-1D6FA', '1D71C-1D734', '1D756-1D76E', '1D790-1D7A8', 0x1D7CA, '1E900-1E921', '1F130-1F149', '1F150-1F169', '1F170-1F189'], # noqa
168 'White_Space': ['0009-000D', 0x0020, 0x0085, 0x00A0, 0x1680, '2000-200A', 0x2028, 0x2029, 0x202F, 0x205F, 0x3000], # noqa
170 # From https://en.wikipedia.org/wiki/Latin_script_in_Unicode
171 'Latin': [
172 '0000-007F', # Basic Latin; this block corresponds to ASCII.
173 '0080-00FF', # Latin-1 Supplement
174 '0100-017F', # Latin Extended-A
175 '0180-024F', # Latin Extended-B
176 '0250-02AF', # IPA Extensions
177 '02B0-02FF', # Spacing Modifier Letters
178 '1D00-1D7F', # Phonetic Extensions
179 '1D80-1DBF', # Phonetic Extensions Supplement
180 '1E00-1EFF', # Latin Extended Additional
181 '2070-209F', # Superscripts and Subscripts
182 '2100-214F', # Letterlike Symbols
183 '2150-218F', # Number Forms
184 '2C60-2C7F', # Latin Extended-C
185 'A720-A7FF', # Latin Extended-D
186 'AB30-AB6F', # Latin Extended-E
187 'FB00-FB4F', # Alphabetic Presentation Forms (Latin ligatures)
188 'FF00-FFEF', # Halfwidth and Fullwidth Forms
189 ],
191 # RNC, from the Wikipedia chart above:
192 'Latin_Alphabetic': [
193 # @
194 '0041-005A', # Basic Latin: A-Z
195 # [\]^_`
196 '0061-007A', # Basic Latin: a-z
197 # {|}~ then mishmash symbols
198 0x00B5, # Basic Latin: mu
199 # more symbols
200 '00C0-00D6', # Basic Latin: accented capitals
201 # multiplication symbol
202 '00D8-00F6', # Basic Latin: more accented capitals, something odd, Eszett, accented lower case # noqa
203 # division symbol
204 '00F8-00FF', # Basic Latin: more accented...
206 '0100-017F', # Latin Extended-A
207 '0180-024F', # Latin Extended-B
208 # IPA Extensions
209 # Spacing Modifier Letters
210 # '1D00-1D7F', # Phonetic Extensions
211 # '1D80-1DBF', # Phonetic Extensions Supplement
212 '1E00-1EFF', # Latin Extended Additional
213 # '2070-209F', # Superscripts and Subscripts
214 # '2100-214F', # Letterlike Symbols
215 # '2150-218F', # Number Forms
216 '2C60-2C7F', # Latin Extended-C
217 'A720-A7AC', # Latin Extended-D: part 1
218 'A7B0-A7B7', # Latin Extended-D: part 2
219 'A7F7-A7FF', # Latin Extended-D: part 3
220 'AB30-AB65', # Latin Extended-E: those assigned
221 'FB00-FB06', # Alphabetic Presentation Forms (Latin ligatures): those assigned # noqa
222 'FF20-FF5F', # Halfwidth and Fullwidth Forms: those assigned
223 ],
224}
def get_unicode_category_strings() -> Dict[str, str]:
    """
    Returns a dictionary mapping Unicode categories (e.g. "ASCII") to a string
    containing those characters.

    This is large (~5 Mb) so don't call it unnecessarily and don't have it as a
    module-level variable.

    NB 'Alphabetic' has length 118240; 'Latin_Alphabetic' only 1022.

    Returns:
        a new dictionary, built afresh on every call, with one entry per key
        of ``_UNICODE_CATEGORY_SRC``
    """
    return {
        category: _unicode_def_src_to_str(definition)
        for category, definition in _UNICODE_CATEGORY_SRC.items()
    }
def get_unicode_characters(category: str) -> str:
    """
    Returns the characters belonging to a single Unicode category, without
    building the strings for every other category.

    Args:
        category:
            a Unicode category, e.g. "ASCII"

    Returns:
        str: a string containing those characters

    Raises:
        :exc:`KeyError` if the category is bad
    """
    definition = _UNICODE_CATEGORY_SRC[category]  # KeyError if unknown
    return _unicode_def_src_to_str(definition)