From df60e5384be346d6d7080c792252083431bfe8a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Tue, 7 Apr 2026 15:58:03 +0200 Subject: [PATCH 1/6] Add pure Python implementation of unicodedata.iter_graphemes() New module Lib/_py_grapheme.py implements the full Unicode TR29 Extended Grapheme Cluster algorithm in pure Python, using the unicodedata.grapheme_cluster_break(), extended_pictographic(), and indic_conjunct_break() property accessors. Refactored GraphemeBreakTest into a BaseGraphemeBreakTest mixin so that both C and pure Python implementations share the same test suite, including the TR29 conformance test against GraphemeBreakTest.txt. Co-Authored-By: Claude Opus 4.6 (1M context) --- Lib/_py_grapheme.py | 198 +++++++++++++++++++++++++++++++++++ Lib/test/test_unicodedata.py | 133 ++++++++++++++++++++--- 2 files changed, 318 insertions(+), 13 deletions(-) create mode 100644 Lib/_py_grapheme.py diff --git a/Lib/_py_grapheme.py b/Lib/_py_grapheme.py new file mode 100644 index 00000000000000..95565caab3fedc --- /dev/null +++ b/Lib/_py_grapheme.py @@ -0,0 +1,198 @@ +"""Pure Python implementation of unicodedata.iter_graphemes(). + +Uses the extended grapheme cluster rules from Unicode TR29. +""" + +import sys +import unicodedata + + +class Segment: + """Represents a grapheme cluster segment within a string.""" + + __slots__ = ('_string', 'start', 'end') + + def __init__(self, string, start, end): + self._string = string + self.start = start + self.end = end + + def __str__(self): + return self._string[self.start:self.end] + + def __repr__(self): + return f"" + + +# Grapheme_Cluster_Break property values (matching C #defines) +_GCB_Other = "Other" +_GCB_Prepend = "Prepend" +_GCB_CR = "CR" +_GCB_LF = "LF" +_GCB_Control = "Control" +_GCB_Extend = "Extend" +_GCB_Regional_Indicator = "Regional_Indicator" +_GCB_SpacingMark = "SpacingMark" +_GCB_L = "L" +_GCB_V = "V" +_GCB_T = "T" +_GCB_LV = "LV" +_GCB_LVT = "LVT" +_GCB_ZWJ = "ZWJ" + +# Indic_Conjunct_Break property values +_InCB_None = "None" +_InCB_Linker = "Linker" +_InCB_Consonant = "Consonant" +_InCB_Extend = "Extend" + +# Extended Pictographic FSM states (for GB11) +_EP_INIT = 0 +_EP_STARTED = 1 +_EP_ZWJ = 2 +_EP_MATCHED = 3 + +# Indic Conjunct Break FSM states (for GB9c) +_INCB_INIT = 0 +_INCB_STARTED = 1 +_INCB_LINKER = 2 +_INCB_MATCHED = 3 + + +def _update_ext_pict_state(state, gcb, ext_pict): + if ext_pict: + return _EP_MATCHED if state == _EP_ZWJ else _EP_STARTED + if state == _EP_STARTED or state == _EP_MATCHED: + if gcb == _GCB_Extend: + return _EP_STARTED + if gcb == _GCB_ZWJ: + return _EP_ZWJ + return _EP_INIT + + +def _update_incb_state(state, incb): + if incb == _InCB_Consonant: + return _INCB_MATCHED if state == _INCB_LINKER else _INCB_STARTED + if state != _INCB_INIT: + if incb == _InCB_Extend: + return _INCB_LINKER if state == _INCB_LINKER else _INCB_STARTED + if incb == _InCB_Linker: + return _INCB_LINKER + return _INCB_INIT + + +def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state): + """Return True if a grapheme cluster break occurs between two characters.""" + # GB3: Do not break between a CR and LF. + if prev_gcb == _GCB_CR and curr_gcb == _GCB_LF: + return False + + # GB4: Break after controls. + if prev_gcb in (_GCB_CR, _GCB_LF, _GCB_Control): + return True + + # GB5: Break before controls. + if curr_gcb in (_GCB_CR, _GCB_LF, _GCB_Control): + return True + + # GB6: Do not break Hangul syllable sequences (L). + if prev_gcb == _GCB_L and curr_gcb in (_GCB_L, _GCB_V, _GCB_LV, _GCB_LVT): + return False + + # GB7: Do not break Hangul syllable sequences (LV, V). + if prev_gcb in (_GCB_LV, _GCB_V) and curr_gcb in (_GCB_V, _GCB_T): + return False + + # GB8: Do not break Hangul syllable sequences (LVT, T). + if prev_gcb in (_GCB_LVT, _GCB_T) and curr_gcb == _GCB_T: + return False + + # GB9: Do not break before extending characters or ZWJ. + if curr_gcb in (_GCB_Extend, _GCB_ZWJ): + return False + + # GB9a: Do not break before SpacingMarks. + if curr_gcb == _GCB_SpacingMark: + return False + + # GB9b: Do not break after Prepend characters. + if prev_gcb == _GCB_Prepend: + return False + + # GB9c: Do not break within Indic conjunct clusters. + if incb_state == _INCB_MATCHED: + return False + + # GB11: Do not break within emoji ZWJ sequences. + if ep_state == _EP_MATCHED: + return False + + # GB12/GB13: Do not break within emoji flag sequences. + if prev_gcb == _GCB_Regional_Indicator and curr_gcb == _GCB_Regional_Indicator: + return ri_flag + + # GB999: Otherwise, break everywhere. + return True + + +def iter_graphemes(string, start=0, end=sys.maxsize): + """Iterate over grapheme clusters in a string. + + Uses extended grapheme cluster rules from TR29. + + Returns an iterator yielding Segment objects with start/end attributes + and str() support. + """ + if not isinstance(string, str): + raise TypeError( + "argument must be a unicode character, not " + f"'{type(string).__name__}'" + ) + + length = len(string) + # Adjust indices (matching CPython's ADJUST_INDICES macro) + if end > length: + end = length + if end < 0: + end += length + if end < 0: + end = 0 + if start < 0: + start += length + if start < 0: + start = 0 + + return _iter_grapheme_clusters(string, start, end) + + +def _iter_grapheme_clusters(string, start, end): + gcb = _GCB_Other + ep_state = _EP_INIT + incb_state = _INCB_INIT + ri_flag = False + + cluster_start = start + pos = start + while pos < end: + ch = string[pos] + curr_gcb = unicodedata.grapheme_cluster_break(ch) + ext_pict = unicodedata.extended_pictographic(ch) + incb = unicodedata.indic_conjunct_break(ch) + + ep_state = _update_ext_pict_state(ep_state, curr_gcb, ext_pict) + ri_flag = (not ri_flag) if curr_gcb == _GCB_Regional_Indicator else False + incb_state = _update_incb_state(incb_state, incb) + + prev_gcb = gcb + gcb = curr_gcb + + if pos != cluster_start and _grapheme_break( + prev_gcb, curr_gcb, ep_state, ri_flag, incb_state + ): + yield Segment(string, cluster_start, pos) + cluster_start = pos + + pos += 1 + + if cluster_start < end: + yield Segment(string, cluster_start, end) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 8ecb0df2f8e5dd..330f1be0dcde15 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -1300,16 +1300,103 @@ class MyStr(str): self.assertIs(type(normalize(form, MyStr(input_str))), str) -class GraphemeBreakTest(unittest.TestCase): +class BaseGraphemeBreakTest: + iter_graphemes = staticmethod(unicodedata.iter_graphemes) + + def test_grapheme_break_types(self): + self.assertRaises(TypeError, self.iter_graphemes) + self.assertRaises(TypeError, self.iter_graphemes, b'x') + + def test_grapheme_break_empty(self): + graphemes = self._graphemes + self.assertEqual(graphemes(''), []) + + def test_grapheme_break_simple(self): + graphemes = self._graphemes + self.assertEqual(graphemes('abcd'), ['a', 'b', 'c', 'd']) + self.assertEqual(graphemes('abcd', 1), ['b', 'c', 'd']) + self.assertEqual(graphemes('abcd', 1, 3), ['b', 'c']) + self.assertEqual(graphemes('abcd', -3), ['b', 'c', 'd']) + self.assertEqual(graphemes('abcd', 1, -1), ['b', 'c']) + self.assertEqual(graphemes('abcd', 3, 1), []) + self.assertEqual(graphemes('abcd', 5), []) + self.assertEqual(graphemes('abcd', 0, 5), ['a', 'b', 'c', 'd']) + self.assertEqual(graphemes('abcd', -5), ['a', 'b', 'c', 'd']) + self.assertEqual(graphemes('abcd', 0, -5), []) + + def test_grapheme_break_rules(self): + graphemes = self._graphemes + # GB3 + self.assertEqual(graphemes('\r\n'), ['\r\n']) + # GB4 + self.assertEqual(graphemes('\r\u0308'), ['\r', '\u0308']) + self.assertEqual(graphemes('\n\u0308'), ['\n', '\u0308']) + self.assertEqual(graphemes('\0\u0308'), ['\0', '\u0308']) + # GB5 + self.assertEqual(graphemes('\u06dd\r'), ['\u06dd', '\r']) + self.assertEqual(graphemes('\u06dd\n'), ['\u06dd', '\n']) + self.assertEqual(graphemes('\u06dd\0'), ['\u06dd', '\0']) + # GB6 + self.assertEqual(graphemes('\u1100\u1160'), ['\u1100\u1160']) + self.assertEqual(graphemes('\u1100\uAC00'), ['\u1100\uAC00']) + self.assertEqual(graphemes('\u1100\uAC01'), ['\u1100\uAC01']) + # GB7 + self.assertEqual(graphemes('\uAC00\u1160'), ['\uAC00\u1160']) + self.assertEqual(graphemes('\uAC00\u11A8'), ['\uAC00\u11A8']) + self.assertEqual(graphemes('\u1160\u1160'), ['\u1160\u1160']) + self.assertEqual(graphemes('\u1160\u11A8'), ['\u1160\u11A8']) + # GB8 + self.assertEqual(graphemes('\uAC01\u11A8'), ['\uAC01\u11A8']) + self.assertEqual(graphemes('\u11A8\u11A8'), ['\u11A8\u11A8']) + # GB9 + self.assertEqual(graphemes('a\u0300'), ['a\u0300']) + self.assertEqual(graphemes('a\u200D'), ['a\u200D']) + # GB9a + self.assertEqual(graphemes('\u0905\u0903'), ['\u0905\u0903']) + # GB9b + self.assertEqual(graphemes('\u06dd\u0661'), ['\u06dd\u0661']) + # GB9c + self.assertEqual(graphemes('\u0915\u094d\u0924'), + ['\u0915\u094d\u0924']) + self.assertEqual(graphemes('\u0915\u094D\u094D\u0924'), + ['\u0915\u094D\u094D\u0924']) + self.assertEqual(graphemes('\u0915\u094D\u0924\u094D\u092F'), + ['\u0915\u094D\u0924\u094D\u092F']) + # GB11 + self.assertEqual(graphemes( + '\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F' + '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'), + ['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F' + '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC']) + # GB12 + self.assertEqual(graphemes( + '\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'), + ['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']) + # GB13 + self.assertEqual(graphemes( + 'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'), + ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']) + + def test_segment_object(self): + segments = list(self.iter_graphemes('spa\u0300m')) + self.assertEqual(len(segments), 4, segments) + segment = segments[2] + self.assertEqual(segment.start, 2) + self.assertEqual(segment.end, 4) + self.assertEqual(str(segment), 'a\u0300') + + def _graphemes(self, *args): + return list(map(str, self.iter_graphemes(*args))) + @requires_resource('network') - def test_grapheme_break(self): + def test_tr29_conformance(self): TESTDATAFILE = "GraphemeBreakTest.txt" testdata = download_test_data_file(TESTDATAFILE) with testdata: - self.run_grapheme_break_tests(testdata) + self._run_grapheme_break_tests(testdata) - def run_grapheme_break_tests(self, testdata): + def _run_grapheme_break_tests(self, testdata): for line in testdata: line, _, comment = line.partition('#') line = line.strip() @@ -1330,19 +1417,32 @@ def run_grapheme_break_tests(self, testdata): self.assertEqual(chunks.pop(), '', line) input = ''.join(chunks) with self.subTest(line): - result = list(unicodedata.iter_graphemes(input)) + result = list(self.iter_graphemes(input)) self.assertEqual(list(map(str, result)), chunks, comment) - self.assertEqual([x.start for x in result], breaks[:-1], comment) - self.assertEqual([x.end for x in result], breaks[1:], comment) + self.assertEqual([x.start for x in result], + breaks[:-1], comment) + self.assertEqual([x.end for x in result], + breaks[1:], comment) for i in range(1, len(breaks) - 1): - result = list(unicodedata.iter_graphemes(input, breaks[i])) - self.assertEqual(list(map(str, result)), chunks[i:], comment) - self.assertEqual([x.start for x in result], breaks[i:-1], comment) - self.assertEqual([x.end for x in result], breaks[i+1:], comment) + result = list(self.iter_graphemes(input, breaks[i])) + self.assertEqual(list(map(str, result)), + chunks[i:], comment) + self.assertEqual([x.start for x in result], + breaks[i:-1], comment) + self.assertEqual([x.end for x in result], + breaks[i+1:], comment) + + +class GraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest): + iter_graphemes = staticmethod(unicodedata.iter_graphemes) + + def test_segment_repr(self): + segment = list(unicodedata.iter_graphemes('spa\u0300m'))[2] + self.assertEqual(repr(segment), '') + self.assertRaises(TypeError, iter, segment) + self.assertRaises(TypeError, len, segment) def test_reference_loops(self): - # Test that reference loops involving GraphemeBreakIterator or - # Segment can be broken by the garbage collector. class S(str): pass @@ -1363,5 +1463,12 @@ class S(str): self.assertIsNone(wr()) +class PyGraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest): + @classmethod + def setUpClass(cls): + from _py_grapheme import iter_graphemes + cls.iter_graphemes = staticmethod(iter_graphemes) + + if __name__ == "__main__": unittest.main() From 6262980c3aca3f013ddac83fc3a0d283cf5680af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Tue, 7 Apr 2026 16:15:19 +0200 Subject: [PATCH 2/6] Add _py_grapheme to stdlib_module_names.h --- Python/stdlib_module_names.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/stdlib_module_names.h b/Python/stdlib_module_names.h index 8937e666bbbdd5..e5f73e638f019f 100644 --- a/Python/stdlib_module_names.h +++ b/Python/stdlib_module_names.h @@ -64,6 +64,7 @@ static const char* _Py_stdlib_module_names[] = { "_posixshmem", "_posixsubprocess", "_py_abc", +"_py_grapheme", "_py_warnings", "_pydatetime", "_pydecimal", From 38db42282e52b46c8c59753e3fa322d04999bc12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Tue, 7 Apr 2026 16:27:47 +0200 Subject: [PATCH 3/6] Make the first argument positional-only --- Lib/_py_grapheme.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_py_grapheme.py b/Lib/_py_grapheme.py index 95565caab3fedc..e59091c2a2fd61 100644 --- a/Lib/_py_grapheme.py +++ b/Lib/_py_grapheme.py @@ -135,7 +135,7 @@ def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state): return True -def iter_graphemes(string, start=0, end=sys.maxsize): +def iter_graphemes(string, /, start=0, end=sys.maxsize): """Iterate over grapheme clusters in a string. Uses extended grapheme cluster rules from TR29. From 70bdb569cffc84c7fbfa9769f1661cec23f7adaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Tue, 7 Apr 2026 16:58:17 +0200 Subject: [PATCH 4/6] Make _py_grapheme standalone by generating property tables Add makegraphemedata() to Tools/unicode/makeunicodedata.py that generates Lib/_py_grapheme_db.py from the Unicode data files (GraphemeBreakProperty.txt, emoji-data.txt, DerivedCoreProperties.txt). _py_grapheme.py now imports property tables from _py_grapheme_db and uses bisect for lookups instead of calling unicodedata functions added in 3.15. This makes the module usable on Python 3.13 and 3.14 by regenerating the tables for the appropriate Unicode version. Co-Authored-By: Claude Opus 4.6 (1M context) --- Lib/_py_grapheme.py | 177 ++++++++++------ Lib/_py_grapheme_db.py | 345 +++++++++++++++++++++++++++++++ Python/stdlib_module_names.h | 1 + Tools/unicode/makeunicodedata.py | 128 ++++++++++++ 4 files changed, 590 insertions(+), 61 deletions(-) create mode 100644 Lib/_py_grapheme_db.py diff --git a/Lib/_py_grapheme.py b/Lib/_py_grapheme.py index e59091c2a2fd61..9cc6194788e5a2 100644 --- a/Lib/_py_grapheme.py +++ b/Lib/_py_grapheme.py @@ -1,10 +1,23 @@ """Pure Python implementation of unicodedata.iter_graphemes(). Uses the extended grapheme cluster rules from Unicode TR29. + +Property tables are in _py_grapheme_db.py, generated by +Tools/unicode/makeunicodedata.py from the Unicode Character Database. """ import sys -import unicodedata +from bisect import bisect_right + +from _py_grapheme_db import ( + GCB_RANGES, + GCB_Other, GCB_Prepend, GCB_CR, GCB_LF, GCB_Control, + GCB_Extend, GCB_Regional_Indicator, GCB_SpacingMark, + GCB_L, GCB_V, GCB_T, GCB_LV, GCB_LVT, GCB_ZWJ, + EXT_PICT_RANGES, + INCB_RANGES, + InCB_None, InCB_Linker, InCB_Consonant, InCB_Extend, +) class Segment: @@ -24,28 +37,6 @@ def __repr__(self): return f"" -# Grapheme_Cluster_Break property values (matching C #defines) -_GCB_Other = "Other" -_GCB_Prepend = "Prepend" -_GCB_CR = "CR" -_GCB_LF = "LF" -_GCB_Control = "Control" -_GCB_Extend = "Extend" -_GCB_Regional_Indicator = "Regional_Indicator" -_GCB_SpacingMark = "SpacingMark" -_GCB_L = "L" -_GCB_V = "V" -_GCB_T = "T" -_GCB_LV = "LV" -_GCB_LVT = "LVT" -_GCB_ZWJ = "ZWJ" - -# Indic_Conjunct_Break property values -_InCB_None = "None" -_InCB_Linker = "Linker" -_InCB_Consonant = "Consonant" -_InCB_Extend = "Extend" - # Extended Pictographic FSM states (for GB11) _EP_INIT = 0 _EP_STARTED = 1 @@ -58,65 +49,97 @@ def __repr__(self): _INCB_LINKER = 2 _INCB_MATCHED = 3 +# Hangul syllable constants +_HANGUL_S_BASE = 0xAC00 +_HANGUL_S_COUNT = 11172 +_HANGUL_T_COUNT = 28 + +# Precomputed start arrays for bisect lookup +_GCB_STARTS = tuple(r[0] for r in GCB_RANGES) +_EXT_PICT_STARTS = tuple(r[0] for r in EXT_PICT_RANGES) +_INCB_STARTS = tuple(r[0] for r in INCB_RANGES) + -def _update_ext_pict_state(state, gcb, ext_pict): - if ext_pict: - return _EP_MATCHED if state == _EP_ZWJ else _EP_STARTED - if state == _EP_STARTED or state == _EP_MATCHED: - if gcb == _GCB_Extend: - return _EP_STARTED - if gcb == _GCB_ZWJ: - return _EP_ZWJ - return _EP_INIT +# --------------------------------------------------------------------------- +# Property lookup functions +# --------------------------------------------------------------------------- +def _get_gcb(cp): + """Return the Grapheme_Cluster_Break value for a codepoint.""" + idx = bisect_right(_GCB_STARTS, cp) - 1 + if idx >= 0: + entry = GCB_RANGES[idx] + if cp <= entry[1]: + return entry[2] + # Hangul syllables: LV if (cp - S_BASE) % T_COUNT == 0, else LVT + if _HANGUL_S_BASE <= cp < _HANGUL_S_BASE + _HANGUL_S_COUNT: + if (cp - _HANGUL_S_BASE) % _HANGUL_T_COUNT == 0: + return GCB_LV + return GCB_LVT + return GCB_Other -def _update_incb_state(state, incb): - if incb == _InCB_Consonant: - return _INCB_MATCHED if state == _INCB_LINKER else _INCB_STARTED - if state != _INCB_INIT: - if incb == _InCB_Extend: - return _INCB_LINKER if state == _INCB_LINKER else _INCB_STARTED - if incb == _InCB_Linker: - return _INCB_LINKER - return _INCB_INIT +def _get_ext_pict(cp): + """Return True if the codepoint has the Extended_Pictographic property.""" + idx = bisect_right(_EXT_PICT_STARTS, cp) - 1 + return idx >= 0 and cp <= EXT_PICT_RANGES[idx][1] + + +def _get_incb(cp): + """Return the Indic_Conjunct_Break value for a codepoint.""" + idx = bisect_right(_INCB_STARTS, cp) - 1 + if idx >= 0: + entry = INCB_RANGES[idx] + if cp <= entry[1]: + return entry[2] + return InCB_None + + +# --------------------------------------------------------------------------- +# Grapheme break algorithm (TR29) +# --------------------------------------------------------------------------- def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state): """Return True if a grapheme cluster break occurs between two characters.""" # GB3: Do not break between a CR and LF. - if prev_gcb == _GCB_CR and curr_gcb == _GCB_LF: + if prev_gcb == GCB_CR and curr_gcb == GCB_LF: return False # GB4: Break after controls. - if prev_gcb in (_GCB_CR, _GCB_LF, _GCB_Control): + if prev_gcb == GCB_CR or prev_gcb == GCB_LF or prev_gcb == GCB_Control: return True # GB5: Break before controls. - if curr_gcb in (_GCB_CR, _GCB_LF, _GCB_Control): + if curr_gcb == GCB_CR or curr_gcb == GCB_LF or curr_gcb == GCB_Control: return True # GB6: Do not break Hangul syllable sequences (L). - if prev_gcb == _GCB_L and curr_gcb in (_GCB_L, _GCB_V, _GCB_LV, _GCB_LVT): + if prev_gcb == GCB_L and ( + curr_gcb == GCB_L or curr_gcb == GCB_V + or curr_gcb == GCB_LV or curr_gcb == GCB_LVT + ): return False # GB7: Do not break Hangul syllable sequences (LV, V). - if prev_gcb in (_GCB_LV, _GCB_V) and curr_gcb in (_GCB_V, _GCB_T): + if (prev_gcb == GCB_LV or prev_gcb == GCB_V) and ( + curr_gcb == GCB_V or curr_gcb == GCB_T + ): return False # GB8: Do not break Hangul syllable sequences (LVT, T). - if prev_gcb in (_GCB_LVT, _GCB_T) and curr_gcb == _GCB_T: + if (prev_gcb == GCB_LVT or prev_gcb == GCB_T) and curr_gcb == GCB_T: return False # GB9: Do not break before extending characters or ZWJ. - if curr_gcb in (_GCB_Extend, _GCB_ZWJ): + if curr_gcb == GCB_Extend or curr_gcb == GCB_ZWJ: return False # GB9a: Do not break before SpacingMarks. - if curr_gcb == _GCB_SpacingMark: + if curr_gcb == GCB_SpacingMark: return False # GB9b: Do not break after Prepend characters. - if prev_gcb == _GCB_Prepend: + if prev_gcb == GCB_Prepend: return False # GB9c: Do not break within Indic conjunct clusters. @@ -128,13 +151,17 @@ def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state): return False # GB12/GB13: Do not break within emoji flag sequences. - if prev_gcb == _GCB_Regional_Indicator and curr_gcb == _GCB_Regional_Indicator: + if prev_gcb == GCB_Regional_Indicator and curr_gcb == GCB_Regional_Indicator: return ri_flag # GB999: Otherwise, break everywhere. return True +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + def iter_graphemes(string, /, start=0, end=sys.maxsize): """Iterate over grapheme clusters in a string. @@ -150,7 +177,6 @@ def iter_graphemes(string, /, start=0, end=sys.maxsize): ) length = len(string) - # Adjust indices (matching CPython's ADJUST_INDICES macro) if end > length: end = length if end < 0: @@ -166,7 +192,7 @@ def iter_graphemes(string, /, start=0, end=sys.maxsize): def _iter_grapheme_clusters(string, start, end): - gcb = _GCB_Other + gcb = GCB_Other ep_state = _EP_INIT incb_state = _INCB_INIT ri_flag = False @@ -174,14 +200,43 @@ def _iter_grapheme_clusters(string, start, end): cluster_start = start pos = start while pos < end: - ch = string[pos] - curr_gcb = unicodedata.grapheme_cluster_break(ch) - ext_pict = unicodedata.extended_pictographic(ch) - incb = unicodedata.indic_conjunct_break(ch) - - ep_state = _update_ext_pict_state(ep_state, curr_gcb, ext_pict) - ri_flag = (not ri_flag) if curr_gcb == _GCB_Regional_Indicator else False - incb_state = _update_incb_state(incb_state, incb) + cp = ord(string[pos]) + curr_gcb = _get_gcb(cp) + + # Update Extended Pictographic FSM (GB11) + ext_pict = _get_ext_pict(cp) + if ext_pict: + ep_state = _EP_MATCHED if ep_state == _EP_ZWJ else _EP_STARTED + elif ep_state == _EP_STARTED or ep_state == _EP_MATCHED: + if curr_gcb == GCB_Extend: + ep_state = _EP_STARTED + elif curr_gcb == GCB_ZWJ: + ep_state = _EP_ZWJ + else: + ep_state = _EP_INIT + else: + ep_state = _EP_INIT + + # Update Regional Indicator flag (GB12/GB13) + ri_flag = not ri_flag if curr_gcb == GCB_Regional_Indicator else False + + # Update Indic Conjunct Break FSM (GB9c) + curr_incb = _get_incb(cp) + if curr_incb == InCB_Consonant: + incb_state = ( + _INCB_MATCHED if incb_state == _INCB_LINKER else _INCB_STARTED + ) + elif incb_state != _INCB_INIT: + if curr_incb == InCB_Extend: + incb_state = ( + _INCB_LINKER if incb_state == _INCB_LINKER else _INCB_STARTED + ) + elif curr_incb == InCB_Linker: + incb_state = _INCB_LINKER + else: + incb_state = _INCB_INIT + else: + incb_state = _INCB_INIT prev_gcb = gcb gcb = curr_gcb diff --git a/Lib/_py_grapheme_db.py b/Lib/_py_grapheme_db.py new file mode 100644 index 00000000000000..23a5c726b91530 --- /dev/null +++ b/Lib/_py_grapheme_db.py @@ -0,0 +1,345 @@ +# This file was generated by Tools/unicode/makeunicodedata.py 3.3. +# Unicode version: 17.0.0. +# +# Property tables for the pure Python grapheme cluster break +# implementation in _py_grapheme.py. +# +# Each table is a tuple of (start, end, value) ranges sorted +# by start codepoint. The default value for codepoints not +# covered is 0. + +UNIDATA_VERSION = '17.0.0' + +# Grapheme_Cluster_Break values +GCB_Other = 0 +GCB_Prepend = 1 +GCB_CR = 2 +GCB_LF = 3 +GCB_Control = 4 +GCB_Extend = 5 +GCB_Regional_Indicator = 6 +GCB_SpacingMark = 7 +GCB_L = 8 +GCB_V = 9 +GCB_T = 10 +GCB_LV = 11 +GCB_LVT = 12 +GCB_ZWJ = 13 + +# Indic_Conjunct_Break values +InCB_None = 0 +InCB_Linker = 1 +InCB_Consonant = 2 +InCB_Extend = 3 + +# Grapheme_Cluster_Break ranges +# (excludes LV/LVT, computed via Hangul syllable arithmetic) +GCB_RANGES = ( + (0x0000,0x0009,4),(0x000A,0x000A,3),(0x000B,0x000C,4),(0x000D,0x000D,2), + (0x000E,0x001F,4),(0x007F,0x009F,4),(0x00AD,0x00AD,4),(0x0300,0x036F,5), + (0x0483,0x0489,5),(0x0591,0x05BD,5),(0x05BF,0x05BF,5),(0x05C1,0x05C2,5), + (0x05C4,0x05C5,5),(0x05C7,0x05C7,5),(0x0600,0x0605,1),(0x0610,0x061A,5), + (0x061C,0x061C,4),(0x064B,0x065F,5),(0x0670,0x0670,5),(0x06D6,0x06DC,5), + (0x06DD,0x06DD,1),(0x06DF,0x06E4,5),(0x06E7,0x06E8,5),(0x06EA,0x06ED,5), + (0x070F,0x070F,1),(0x0711,0x0711,5),(0x0730,0x074A,5),(0x07A6,0x07B0,5), + (0x07EB,0x07F3,5),(0x07FD,0x07FD,5),(0x0816,0x0819,5),(0x081B,0x0823,5), + (0x0825,0x0827,5),(0x0829,0x082D,5),(0x0859,0x085B,5),(0x0890,0x0891,1), + (0x0897,0x089F,5),(0x08CA,0x08E1,5),(0x08E2,0x08E2,1),(0x08E3,0x0902,5), + (0x0903,0x0903,7),(0x093A,0x093A,5),(0x093B,0x093B,7),(0x093C,0x093C,5), + (0x093E,0x0940,7),(0x0941,0x0948,5),(0x0949,0x094C,7),(0x094D,0x094D,5), + (0x094E,0x094F,7),(0x0951,0x0957,5),(0x0962,0x0963,5),(0x0981,0x0981,5), + (0x0982,0x0983,7),(0x09BC,0x09BC,5),(0x09BE,0x09BE,5),(0x09BF,0x09C0,7), + (0x09C1,0x09C4,5),(0x09C7,0x09C8,7),(0x09CB,0x09CC,7),(0x09CD,0x09CD,5), + (0x09D7,0x09D7,5),(0x09E2,0x09E3,5),(0x09FE,0x09FE,5),(0x0A01,0x0A02,5), + (0x0A03,0x0A03,7),(0x0A3C,0x0A3C,5),(0x0A3E,0x0A40,7),(0x0A41,0x0A42,5), + (0x0A47,0x0A48,5),(0x0A4B,0x0A4D,5),(0x0A51,0x0A51,5),(0x0A70,0x0A71,5), + (0x0A75,0x0A75,5),(0x0A81,0x0A82,5),(0x0A83,0x0A83,7),(0x0ABC,0x0ABC,5), + (0x0ABE,0x0AC0,7),(0x0AC1,0x0AC5,5),(0x0AC7,0x0AC8,5),(0x0AC9,0x0AC9,7), + (0x0ACB,0x0ACC,7),(0x0ACD,0x0ACD,5),(0x0AE2,0x0AE3,5),(0x0AFA,0x0AFF,5), + (0x0B01,0x0B01,5),(0x0B02,0x0B03,7),(0x0B3C,0x0B3C,5),(0x0B3E,0x0B3F,5), + (0x0B40,0x0B40,7),(0x0B41,0x0B44,5),(0x0B47,0x0B48,7),(0x0B4B,0x0B4C,7), + (0x0B4D,0x0B4D,5),(0x0B55,0x0B57,5),(0x0B62,0x0B63,5),(0x0B82,0x0B82,5), + (0x0BBE,0x0BBE,5),(0x0BBF,0x0BBF,7),(0x0BC0,0x0BC0,5),(0x0BC1,0x0BC2,7), + (0x0BC6,0x0BC8,7),(0x0BCA,0x0BCC,7),(0x0BCD,0x0BCD,5),(0x0BD7,0x0BD7,5), + (0x0C00,0x0C00,5),(0x0C01,0x0C03,7),(0x0C04,0x0C04,5),(0x0C3C,0x0C3C,5), + (0x0C3E,0x0C40,5),(0x0C41,0x0C44,7),(0x0C46,0x0C48,5),(0x0C4A,0x0C4D,5), + (0x0C55,0x0C56,5),(0x0C62,0x0C63,5),(0x0C81,0x0C81,5),(0x0C82,0x0C83,7), + (0x0CBC,0x0CBC,5),(0x0CBE,0x0CBE,7),(0x0CBF,0x0CC0,5),(0x0CC1,0x0CC1,7), + (0x0CC2,0x0CC2,5),(0x0CC3,0x0CC4,7),(0x0CC6,0x0CC8,5),(0x0CCA,0x0CCD,5), + (0x0CD5,0x0CD6,5),(0x0CE2,0x0CE3,5),(0x0CF3,0x0CF3,7),(0x0D00,0x0D01,5), + (0x0D02,0x0D03,7),(0x0D3B,0x0D3C,5),(0x0D3E,0x0D3E,5),(0x0D3F,0x0D40,7), + (0x0D41,0x0D44,5),(0x0D46,0x0D48,7),(0x0D4A,0x0D4C,7),(0x0D4D,0x0D4D,5), + (0x0D4E,0x0D4E,1),(0x0D57,0x0D57,5),(0x0D62,0x0D63,5),(0x0D81,0x0D81,5), + (0x0D82,0x0D83,7),(0x0DCA,0x0DCA,5),(0x0DCF,0x0DCF,5),(0x0DD0,0x0DD1,7), + (0x0DD2,0x0DD4,5),(0x0DD6,0x0DD6,5),(0x0DD8,0x0DDE,7),(0x0DDF,0x0DDF,5), + (0x0DF2,0x0DF3,7),(0x0E31,0x0E31,5),(0x0E33,0x0E33,7),(0x0E34,0x0E3A,5), + (0x0E47,0x0E4E,5),(0x0EB1,0x0EB1,5),(0x0EB3,0x0EB3,7),(0x0EB4,0x0EBC,5), + (0x0EC8,0x0ECE,5),(0x0F18,0x0F19,5),(0x0F35,0x0F35,5),(0x0F37,0x0F37,5), + (0x0F39,0x0F39,5),(0x0F3E,0x0F3F,7),(0x0F71,0x0F7E,5),(0x0F7F,0x0F7F,7), + (0x0F80,0x0F84,5),(0x0F86,0x0F87,5),(0x0F8D,0x0F97,5),(0x0F99,0x0FBC,5), + (0x0FC6,0x0FC6,5),(0x102D,0x1030,5),(0x1031,0x1031,7),(0x1032,0x1037,5), + (0x1039,0x103A,5),(0x103B,0x103C,7),(0x103D,0x103E,5),(0x1056,0x1057,7), + (0x1058,0x1059,5),(0x105E,0x1060,5),(0x1071,0x1074,5),(0x1082,0x1082,5), + (0x1084,0x1084,7),(0x1085,0x1086,5),(0x108D,0x108D,5),(0x109D,0x109D,5), + (0x1100,0x115F,8),(0x1160,0x11A7,9),(0x11A8,0x11FF,10),(0x135D,0x135F,5), + (0x1712,0x1715,5),(0x1732,0x1734,5),(0x1752,0x1753,5),(0x1772,0x1773,5), + (0x17B4,0x17B5,5),(0x17B6,0x17B6,7),(0x17B7,0x17BD,5),(0x17BE,0x17C5,7), + (0x17C6,0x17C6,5),(0x17C7,0x17C8,7),(0x17C9,0x17D3,5),(0x17DD,0x17DD,5), + (0x180B,0x180D,5),(0x180E,0x180E,4),(0x180F,0x180F,5),(0x1885,0x1886,5), + (0x18A9,0x18A9,5),(0x1920,0x1922,5),(0x1923,0x1926,7),(0x1927,0x1928,5), + (0x1929,0x192B,7),(0x1930,0x1931,7),(0x1932,0x1932,5),(0x1933,0x1938,7), + (0x1939,0x193B,5),(0x1A17,0x1A18,5),(0x1A19,0x1A1A,7),(0x1A1B,0x1A1B,5), + (0x1A55,0x1A55,7),(0x1A56,0x1A56,5),(0x1A57,0x1A57,7),(0x1A58,0x1A5E,5), + (0x1A60,0x1A60,5),(0x1A62,0x1A62,5),(0x1A65,0x1A6C,5),(0x1A6D,0x1A72,7), + (0x1A73,0x1A7C,5),(0x1A7F,0x1A7F,5),(0x1AB0,0x1ADD,5),(0x1AE0,0x1AEB,5), + (0x1B00,0x1B03,5),(0x1B04,0x1B04,7),(0x1B34,0x1B3D,5),(0x1B3E,0x1B41,7), + (0x1B42,0x1B44,5),(0x1B6B,0x1B73,5),(0x1B80,0x1B81,5),(0x1B82,0x1B82,7), + (0x1BA1,0x1BA1,7),(0x1BA2,0x1BA5,5),(0x1BA6,0x1BA7,7),(0x1BA8,0x1BAD,5), + (0x1BE6,0x1BE6,5),(0x1BE7,0x1BE7,7),(0x1BE8,0x1BE9,5),(0x1BEA,0x1BEC,7), + (0x1BED,0x1BED,5),(0x1BEE,0x1BEE,7),(0x1BEF,0x1BF3,5),(0x1C24,0x1C2B,7), + (0x1C2C,0x1C33,5),(0x1C34,0x1C35,7),(0x1C36,0x1C37,5),(0x1CD0,0x1CD2,5), + (0x1CD4,0x1CE0,5),(0x1CE1,0x1CE1,7),(0x1CE2,0x1CE8,5),(0x1CED,0x1CED,5), + (0x1CF4,0x1CF4,5),(0x1CF7,0x1CF7,7),(0x1CF8,0x1CF9,5),(0x1DC0,0x1DFF,5), + (0x200B,0x200B,4),(0x200C,0x200C,5),(0x200D,0x200D,13),(0x200E,0x200F,4), + (0x2028,0x202E,4),(0x2060,0x206F,4),(0x20D0,0x20F0,5),(0x2CEF,0x2CF1,5), + (0x2D7F,0x2D7F,5),(0x2DE0,0x2DFF,5),(0x302A,0x302F,5),(0x3099,0x309A,5), + (0xA66F,0xA672,5),(0xA674,0xA67D,5),(0xA69E,0xA69F,5),(0xA6F0,0xA6F1,5), + (0xA802,0xA802,5),(0xA806,0xA806,5),(0xA80B,0xA80B,5),(0xA823,0xA824,7), + (0xA825,0xA826,5),(0xA827,0xA827,7),(0xA82C,0xA82C,5),(0xA880,0xA881,7), + (0xA8B4,0xA8C3,7),(0xA8C4,0xA8C5,5),(0xA8E0,0xA8F1,5),(0xA8FF,0xA8FF,5), + (0xA926,0xA92D,5),(0xA947,0xA951,5),(0xA952,0xA952,7),(0xA953,0xA953,5), + (0xA960,0xA97C,8),(0xA980,0xA982,5),(0xA983,0xA983,7),(0xA9B3,0xA9B3,5), + (0xA9B4,0xA9B5,7),(0xA9B6,0xA9B9,5),(0xA9BA,0xA9BB,7),(0xA9BC,0xA9BD,5), + (0xA9BE,0xA9BF,7),(0xA9C0,0xA9C0,5),(0xA9E5,0xA9E5,5),(0xAA29,0xAA2E,5), + (0xAA2F,0xAA30,7),(0xAA31,0xAA32,5),(0xAA33,0xAA34,7),(0xAA35,0xAA36,5), + (0xAA43,0xAA43,5),(0xAA4C,0xAA4C,5),(0xAA4D,0xAA4D,7),(0xAA7C,0xAA7C,5), + (0xAAB0,0xAAB0,5),(0xAAB2,0xAAB4,5),(0xAAB7,0xAAB8,5),(0xAABE,0xAABF,5), + (0xAAC1,0xAAC1,5),(0xAAEB,0xAAEB,7),(0xAAEC,0xAAED,5),(0xAAEE,0xAAEF,7), + (0xAAF5,0xAAF5,7),(0xAAF6,0xAAF6,5),(0xABE3,0xABE4,7),(0xABE5,0xABE5,5), + (0xABE6,0xABE7,7),(0xABE8,0xABE8,5),(0xABE9,0xABEA,7),(0xABEC,0xABEC,7), + (0xABED,0xABED,5),(0xD7B0,0xD7C6,9),(0xD7CB,0xD7FB,10),(0xFB1E,0xFB1E,5), + (0xFE00,0xFE0F,5),(0xFE20,0xFE2F,5),(0xFEFF,0xFEFF,4),(0xFF9E,0xFF9F,5), + (0xFFF0,0xFFFB,4),(0x101FD,0x101FD,5),(0x102E0,0x102E0,5),(0x10376,0x1037A,5), + (0x10A01,0x10A03,5),(0x10A05,0x10A06,5),(0x10A0C,0x10A0F,5),(0x10A38,0x10A3A,5), + (0x10A3F,0x10A3F,5),(0x10AE5,0x10AE6,5),(0x10D24,0x10D27,5),(0x10D69,0x10D6D,5), + (0x10EAB,0x10EAC,5),(0x10EFA,0x10EFF,5),(0x10F46,0x10F50,5),(0x10F82,0x10F85,5), + (0x11000,0x11000,7),(0x11001,0x11001,5),(0x11002,0x11002,7),(0x11038,0x11046,5), + (0x11070,0x11070,5),(0x11073,0x11074,5),(0x1107F,0x11081,5),(0x11082,0x11082,7), + (0x110B0,0x110B2,7),(0x110B3,0x110B6,5),(0x110B7,0x110B8,7),(0x110B9,0x110BA,5), + (0x110BD,0x110BD,1),(0x110C2,0x110C2,5),(0x110CD,0x110CD,1),(0x11100,0x11102,5), + (0x11127,0x1112B,5),(0x1112C,0x1112C,7),(0x1112D,0x11134,5),(0x11145,0x11146,7), + (0x11173,0x11173,5),(0x11180,0x11181,5),(0x11182,0x11182,7),(0x111B3,0x111B5,7), + (0x111B6,0x111BE,5),(0x111BF,0x111BF,7),(0x111C0,0x111C0,5),(0x111C2,0x111C3,1), + (0x111C9,0x111CC,5),(0x111CE,0x111CE,7),(0x111CF,0x111CF,5),(0x1122C,0x1122E,7), + (0x1122F,0x11231,5),(0x11232,0x11233,7),(0x11234,0x11237,5),(0x1123E,0x1123E,5), + (0x11241,0x11241,5),(0x112DF,0x112DF,5),(0x112E0,0x112E2,7),(0x112E3,0x112EA,5), + (0x11300,0x11301,5),(0x11302,0x11303,7),(0x1133B,0x1133C,5),(0x1133E,0x1133E,5), + (0x1133F,0x1133F,7),(0x11340,0x11340,5),(0x11341,0x11344,7),(0x11347,0x11348,7), + (0x1134B,0x1134C,7),(0x1134D,0x1134D,5),(0x11357,0x11357,5),(0x11362,0x11363,7), + (0x11366,0x1136C,5),(0x11370,0x11374,5),(0x113B8,0x113B8,5),(0x113B9,0x113BA,7), + (0x113BB,0x113C0,5),(0x113C2,0x113C2,5),(0x113C5,0x113C5,5),(0x113C7,0x113C9,5), + (0x113CA,0x113CA,7),(0x113CC,0x113CD,7),(0x113CE,0x113D0,5),(0x113D1,0x113D1,1), + (0x113D2,0x113D2,5),(0x113E1,0x113E2,5),(0x11435,0x11437,7),(0x11438,0x1143F,5), + (0x11440,0x11441,7),(0x11442,0x11444,5),(0x11445,0x11445,7),(0x11446,0x11446,5), + (0x1145E,0x1145E,5),(0x114B0,0x114B0,5),(0x114B1,0x114B2,7),(0x114B3,0x114B8,5), + (0x114B9,0x114B9,7),(0x114BA,0x114BA,5),(0x114BB,0x114BC,7),(0x114BD,0x114BD,5), + (0x114BE,0x114BE,7),(0x114BF,0x114C0,5),(0x114C1,0x114C1,7),(0x114C2,0x114C3,5), + (0x115AF,0x115AF,5),(0x115B0,0x115B1,7),(0x115B2,0x115B5,5),(0x115B8,0x115BB,7), + (0x115BC,0x115BD,5),(0x115BE,0x115BE,7),(0x115BF,0x115C0,5),(0x115DC,0x115DD,5), + (0x11630,0x11632,7),(0x11633,0x1163A,5),(0x1163B,0x1163C,7),(0x1163D,0x1163D,5), + (0x1163E,0x1163E,7),(0x1163F,0x11640,5),(0x116AB,0x116AB,5),(0x116AC,0x116AC,7), + (0x116AD,0x116AD,5),(0x116AE,0x116AF,7),(0x116B0,0x116B7,5),(0x1171D,0x1171D,5), + (0x1171E,0x1171E,7),(0x1171F,0x1171F,5),(0x11722,0x11725,5),(0x11726,0x11726,7), + (0x11727,0x1172B,5),(0x1182C,0x1182E,7),(0x1182F,0x11837,5),(0x11838,0x11838,7), + (0x11839,0x1183A,5),(0x11930,0x11930,5),(0x11931,0x11935,7),(0x11937,0x11938,7), + (0x1193B,0x1193E,5),(0x1193F,0x1193F,1),(0x11940,0x11940,7),(0x11941,0x11941,1), + (0x11942,0x11942,7),(0x11943,0x11943,5),(0x119D1,0x119D3,7),(0x119D4,0x119D7,5), + (0x119DA,0x119DB,5),(0x119DC,0x119DF,7),(0x119E0,0x119E0,5),(0x119E4,0x119E4,7), + (0x11A01,0x11A0A,5),(0x11A33,0x11A38,5),(0x11A39,0x11A39,7),(0x11A3B,0x11A3E,5), + (0x11A47,0x11A47,5),(0x11A51,0x11A56,5),(0x11A57,0x11A58,7),(0x11A59,0x11A5B,5), + (0x11A84,0x11A89,1),(0x11A8A,0x11A96,5),(0x11A97,0x11A97,7),(0x11A98,0x11A99,5), + (0x11B60,0x11B60,5),(0x11B61,0x11B61,7),(0x11B62,0x11B64,5),(0x11B65,0x11B65,7), + (0x11B66,0x11B66,5),(0x11B67,0x11B67,7),(0x11C2F,0x11C2F,7),(0x11C30,0x11C36,5), + (0x11C38,0x11C3D,5),(0x11C3E,0x11C3E,7),(0x11C3F,0x11C3F,5),(0x11C92,0x11CA7,5), + (0x11CA9,0x11CA9,7),(0x11CAA,0x11CB0,5),(0x11CB1,0x11CB1,7),(0x11CB2,0x11CB3,5), + (0x11CB4,0x11CB4,7),(0x11CB5,0x11CB6,5),(0x11D31,0x11D36,5),(0x11D3A,0x11D3A,5), + (0x11D3C,0x11D3D,5),(0x11D3F,0x11D45,5),(0x11D46,0x11D46,1),(0x11D47,0x11D47,5), + (0x11D8A,0x11D8E,7),(0x11D90,0x11D91,5),(0x11D93,0x11D94,7),(0x11D95,0x11D95,5), + (0x11D96,0x11D96,7),(0x11D97,0x11D97,5),(0x11EF3,0x11EF4,5),(0x11EF5,0x11EF6,7), + (0x11F00,0x11F01,5),(0x11F02,0x11F02,1),(0x11F03,0x11F03,7),(0x11F34,0x11F35,7), + (0x11F36,0x11F3A,5),(0x11F3E,0x11F3F,7),(0x11F40,0x11F42,5),(0x11F5A,0x11F5A,5), + (0x13430,0x1343F,4),(0x13440,0x13440,5),(0x13447,0x13455,5),(0x1611E,0x16129,5), + (0x1612A,0x1612C,7),(0x1612D,0x1612F,5),(0x16AF0,0x16AF4,5),(0x16B30,0x16B36,5), + (0x16D63,0x16D63,9),(0x16D67,0x16D6A,9),(0x16F4F,0x16F4F,5),(0x16F51,0x16F87,7), + (0x16F8F,0x16F92,5),(0x16FE4,0x16FE4,5),(0x16FF0,0x16FF1,5),(0x1BC9D,0x1BC9E,5), + (0x1BCA0,0x1BCA3,4),(0x1CF00,0x1CF2D,5),(0x1CF30,0x1CF46,5),(0x1D165,0x1D169,5), + (0x1D16D,0x1D172,5),(0x1D173,0x1D17A,4),(0x1D17B,0x1D182,5),(0x1D185,0x1D18B,5), + (0x1D1AA,0x1D1AD,5),(0x1D242,0x1D244,5),(0x1DA00,0x1DA36,5),(0x1DA3B,0x1DA6C,5), + (0x1DA75,0x1DA75,5),(0x1DA84,0x1DA84,5),(0x1DA9B,0x1DA9F,5),(0x1DAA1,0x1DAAF,5), + (0x1E000,0x1E006,5),(0x1E008,0x1E018,5),(0x1E01B,0x1E021,5),(0x1E023,0x1E024,5), + (0x1E026,0x1E02A,5),(0x1E08F,0x1E08F,5),(0x1E130,0x1E136,5),(0x1E2AE,0x1E2AE,5), + (0x1E2EC,0x1E2EF,5),(0x1E4EC,0x1E4EF,5),(0x1E5EE,0x1E5EF,5),(0x1E6E3,0x1E6E3,5), + (0x1E6E6,0x1E6E6,5),(0x1E6EE,0x1E6EF,5),(0x1E6F5,0x1E6F5,5),(0x1E8D0,0x1E8D6,5), + (0x1E944,0x1E94A,5),(0x1F1E6,0x1F1FF,6),(0x1F3FB,0x1F3FF,5),(0xE0000,0xE001F,4), + (0xE0020,0xE007F,5),(0xE0080,0xE00FF,4),(0xE0100,0xE01EF,5),(0xE01F0,0xE0FFF,4), +) + +# Extended_Pictographic ranges +EXT_PICT_RANGES = ( + (0x00A9,0x00A9),(0x00AE,0x00AE),(0x203C,0x203C),(0x2049,0x2049),(0x2122,0x2122), + (0x2139,0x2139),(0x2194,0x2199),(0x21A9,0x21AA),(0x231A,0x231B),(0x2328,0x2328), + (0x23CF,0x23CF),(0x23E9,0x23F3),(0x23F8,0x23FA),(0x24C2,0x24C2),(0x25AA,0x25AB), + (0x25B6,0x25B6),(0x25C0,0x25C0),(0x25FB,0x25FE),(0x2600,0x2604),(0x260E,0x260E), + (0x2611,0x2611),(0x2614,0x2615),(0x2618,0x2618),(0x261D,0x261D),(0x2620,0x2620), + (0x2622,0x2623),(0x2626,0x2626),(0x262A,0x262A),(0x262E,0x262F),(0x2638,0x263A), + (0x2640,0x2640),(0x2642,0x2642),(0x2648,0x2653),(0x265F,0x2660),(0x2663,0x2663), + (0x2665,0x2666),(0x2668,0x2668),(0x267B,0x267B),(0x267E,0x267F),(0x2692,0x2697), + (0x2699,0x2699),(0x269B,0x269C),(0x26A0,0x26A1),(0x26A7,0x26A7),(0x26AA,0x26AB), + (0x26B0,0x26B1),(0x26BD,0x26BE),(0x26C4,0x26C5),(0x26C8,0x26C8),(0x26CE,0x26CF), + (0x26D1,0x26D1),(0x26D3,0x26D4),(0x26E9,0x26EA),(0x26F0,0x26F5),(0x26F7,0x26FA), + (0x26FD,0x26FD),(0x2702,0x2702),(0x2705,0x2705),(0x2708,0x270D),(0x270F,0x270F), + (0x2712,0x2712),(0x2714,0x2714),(0x2716,0x2716),(0x271D,0x271D),(0x2721,0x2721), + (0x2728,0x2728),(0x2733,0x2734),(0x2744,0x2744),(0x2747,0x2747),(0x274C,0x274C), + (0x274E,0x274E),(0x2753,0x2755),(0x2757,0x2757),(0x2763,0x2764),(0x2795,0x2797), + (0x27A1,0x27A1),(0x27B0,0x27B0),(0x27BF,0x27BF),(0x2934,0x2935),(0x2B05,0x2B07), + (0x2B1B,0x2B1C),(0x2B50,0x2B50),(0x2B55,0x2B55),(0x3030,0x3030),(0x303D,0x303D), + (0x3297,0x3297),(0x3299,0x3299),(0x1F004,0x1F004),(0x1F02C,0x1F02F),(0x1F094,0x1F09F), + (0x1F0AF,0x1F0B0),(0x1F0C0,0x1F0C0),(0x1F0CF,0x1F0D0),(0x1F0F6,0x1F0FF),(0x1F170,0x1F171), + (0x1F17E,0x1F17F),(0x1F18E,0x1F18E),(0x1F191,0x1F19A),(0x1F1AE,0x1F1E5),(0x1F201,0x1F20F), + (0x1F21A,0x1F21A),(0x1F22F,0x1F22F),(0x1F232,0x1F23A),(0x1F23C,0x1F23F),(0x1F249,0x1F25F), + (0x1F266,0x1F321),(0x1F324,0x1F393),(0x1F396,0x1F397),(0x1F399,0x1F39B),(0x1F39E,0x1F3F0), + (0x1F3F3,0x1F3F5),(0x1F3F7,0x1F3FA),(0x1F400,0x1F4FD),(0x1F4FF,0x1F53D),(0x1F549,0x1F54E), + (0x1F550,0x1F567),(0x1F56F,0x1F570),(0x1F573,0x1F57A),(0x1F587,0x1F587),(0x1F58A,0x1F58D), + (0x1F590,0x1F590),(0x1F595,0x1F596),(0x1F5A4,0x1F5A5),(0x1F5A8,0x1F5A8),(0x1F5B1,0x1F5B2), + (0x1F5BC,0x1F5BC),(0x1F5C2,0x1F5C4),(0x1F5D1,0x1F5D3),(0x1F5DC,0x1F5DE),(0x1F5E1,0x1F5E1), + (0x1F5E3,0x1F5E3),(0x1F5E8,0x1F5E8),(0x1F5EF,0x1F5EF),(0x1F5F3,0x1F5F3),(0x1F5FA,0x1F64F), + (0x1F680,0x1F6C5),(0x1F6CB,0x1F6D2),(0x1F6D5,0x1F6E5),(0x1F6E9,0x1F6E9),(0x1F6EB,0x1F6F0), + (0x1F6F3,0x1F6FF),(0x1F7DA,0x1F7FF),(0x1F80C,0x1F80F),(0x1F848,0x1F84F),(0x1F85A,0x1F85F), + (0x1F888,0x1F88F),(0x1F8AE,0x1F8AF),(0x1F8BC,0x1F8BF),(0x1F8C2,0x1F8CF),(0x1F8D9,0x1F8FF), + (0x1F90C,0x1F93A),(0x1F93C,0x1F945),(0x1F947,0x1F9FF),(0x1FA58,0x1FA5F),(0x1FA6E,0x1FAFF), + (0x1FC00,0x1FFFD), +) + +# Indic_Conjunct_Break ranges +INCB_RANGES = ( + (0x0300,0x036F,3),(0x0483,0x0489,3),(0x0591,0x05BD,3),(0x05BF,0x05BF,3), + (0x05C1,0x05C2,3),(0x05C4,0x05C5,3),(0x05C7,0x05C7,3),(0x0610,0x061A,3), + (0x064B,0x065F,3),(0x0670,0x0670,3),(0x06D6,0x06DC,3),(0x06DF,0x06E4,3), + (0x06E7,0x06E8,3),(0x06EA,0x06ED,3),(0x0711,0x0711,3),(0x0730,0x074A,3), + (0x07A6,0x07B0,3),(0x07EB,0x07F3,3),(0x07FD,0x07FD,3),(0x0816,0x0819,3), + (0x081B,0x0823,3),(0x0825,0x0827,3),(0x0829,0x082D,3),(0x0859,0x085B,3), + (0x0897,0x089F,3),(0x08CA,0x08E1,3),(0x08E3,0x0902,3),(0x0915,0x0939,2), + (0x093A,0x093A,3),(0x093C,0x093C,3),(0x0941,0x0948,3),(0x094D,0x094D,1), + (0x0951,0x0957,3),(0x0958,0x095F,2),(0x0962,0x0963,3),(0x0978,0x097F,2), + (0x0981,0x0981,3),(0x0995,0x09A8,2),(0x09AA,0x09B0,2),(0x09B2,0x09B2,2), + (0x09B6,0x09B9,2),(0x09BC,0x09BC,3),(0x09BE,0x09BE,3),(0x09C1,0x09C4,3), + (0x09CD,0x09CD,1),(0x09D7,0x09D7,3),(0x09DC,0x09DD,2),(0x09DF,0x09DF,2), + (0x09E2,0x09E3,3),(0x09F0,0x09F1,2),(0x09FE,0x09FE,3),(0x0A01,0x0A02,3), + (0x0A3C,0x0A3C,3),(0x0A41,0x0A42,3),(0x0A47,0x0A48,3),(0x0A4B,0x0A4D,3), + (0x0A51,0x0A51,3),(0x0A70,0x0A71,3),(0x0A75,0x0A75,3),(0x0A81,0x0A82,3), + (0x0A95,0x0AA8,2),(0x0AAA,0x0AB0,2),(0x0AB2,0x0AB3,2),(0x0AB5,0x0AB9,2), + (0x0ABC,0x0ABC,3),(0x0AC1,0x0AC5,3),(0x0AC7,0x0AC8,3),(0x0ACD,0x0ACD,1), + (0x0AE2,0x0AE3,3),(0x0AF9,0x0AF9,2),(0x0AFA,0x0AFF,3),(0x0B01,0x0B01,3), + (0x0B15,0x0B28,2),(0x0B2A,0x0B30,2),(0x0B32,0x0B33,2),(0x0B35,0x0B39,2), + (0x0B3C,0x0B3C,3),(0x0B3E,0x0B3F,3),(0x0B41,0x0B44,3),(0x0B4D,0x0B4D,1), + (0x0B55,0x0B57,3),(0x0B5C,0x0B5D,2),(0x0B5F,0x0B5F,2),(0x0B62,0x0B63,3), + (0x0B71,0x0B71,2),(0x0B82,0x0B82,3),(0x0BBE,0x0BBE,3),(0x0BC0,0x0BC0,3), + (0x0BCD,0x0BCD,3),(0x0BD7,0x0BD7,3),(0x0C00,0x0C00,3),(0x0C04,0x0C04,3), + (0x0C15,0x0C28,2),(0x0C2A,0x0C39,2),(0x0C3C,0x0C3C,3),(0x0C3E,0x0C40,3), + (0x0C46,0x0C48,3),(0x0C4A,0x0C4C,3),(0x0C4D,0x0C4D,1),(0x0C55,0x0C56,3), + (0x0C58,0x0C5A,2),(0x0C62,0x0C63,3),(0x0C81,0x0C81,3),(0x0CBC,0x0CBC,3), + (0x0CBF,0x0CC0,3),(0x0CC2,0x0CC2,3),(0x0CC6,0x0CC8,3),(0x0CCA,0x0CCD,3), + (0x0CD5,0x0CD6,3),(0x0CE2,0x0CE3,3),(0x0D00,0x0D01,3),(0x0D15,0x0D3A,2), + (0x0D3B,0x0D3C,3),(0x0D3E,0x0D3E,3),(0x0D41,0x0D44,3),(0x0D4D,0x0D4D,1), + (0x0D57,0x0D57,3),(0x0D62,0x0D63,3),(0x0D81,0x0D81,3),(0x0DCA,0x0DCA,3), + (0x0DCF,0x0DCF,3),(0x0DD2,0x0DD4,3),(0x0DD6,0x0DD6,3),(0x0DDF,0x0DDF,3), + (0x0E31,0x0E31,3),(0x0E34,0x0E3A,3),(0x0E47,0x0E4E,3),(0x0EB1,0x0EB1,3), + (0x0EB4,0x0EBC,3),(0x0EC8,0x0ECE,3),(0x0F18,0x0F19,3),(0x0F35,0x0F35,3), + (0x0F37,0x0F37,3),(0x0F39,0x0F39,3),(0x0F71,0x0F7E,3),(0x0F80,0x0F84,3), + (0x0F86,0x0F87,3),(0x0F8D,0x0F97,3),(0x0F99,0x0FBC,3),(0x0FC6,0x0FC6,3), + (0x1000,0x102A,2),(0x102D,0x1030,3),(0x1032,0x1037,3),(0x1039,0x1039,1), + (0x103A,0x103A,3),(0x103D,0x103E,3),(0x103F,0x103F,2),(0x1050,0x1055,2), + (0x1058,0x1059,3),(0x105A,0x105D,2),(0x105E,0x1060,3),(0x1061,0x1061,2), + (0x1065,0x1066,2),(0x106E,0x1070,2),(0x1071,0x1074,3),(0x1075,0x1081,2), + (0x1082,0x1082,3),(0x1085,0x1086,3),(0x108D,0x108D,3),(0x108E,0x108E,2), + (0x109D,0x109D,3),(0x135D,0x135F,3),(0x1712,0x1715,3),(0x1732,0x1734,3), + (0x1752,0x1753,3),(0x1772,0x1773,3),(0x1780,0x17B3,2),(0x17B4,0x17B5,3), + (0x17B7,0x17BD,3),(0x17C6,0x17C6,3),(0x17C9,0x17D1,3),(0x17D2,0x17D2,1), + (0x17D3,0x17D3,3),(0x17DD,0x17DD,3),(0x180B,0x180D,3),(0x180F,0x180F,3), + (0x1885,0x1886,3),(0x18A9,0x18A9,3),(0x1920,0x1922,3),(0x1927,0x1928,3), + (0x1932,0x1932,3),(0x1939,0x193B,3),(0x1A17,0x1A18,3),(0x1A1B,0x1A1B,3), + (0x1A20,0x1A54,2),(0x1A56,0x1A56,3),(0x1A58,0x1A5E,3),(0x1A60,0x1A60,1), + (0x1A62,0x1A62,3),(0x1A65,0x1A6C,3),(0x1A73,0x1A7C,3),(0x1A7F,0x1A7F,3), + (0x1AB0,0x1ADD,3),(0x1AE0,0x1AEB,3),(0x1B00,0x1B03,3),(0x1B0B,0x1B0C,2), + (0x1B13,0x1B33,2),(0x1B34,0x1B3D,3),(0x1B42,0x1B43,3),(0x1B44,0x1B44,1), + (0x1B45,0x1B4C,2),(0x1B6B,0x1B73,3),(0x1B80,0x1B81,3),(0x1B83,0x1BA0,2), + (0x1BA2,0x1BA5,3),(0x1BA8,0x1BAA,3),(0x1BAB,0x1BAB,1),(0x1BAC,0x1BAD,3), + (0x1BAE,0x1BAF,2),(0x1BBB,0x1BBD,2),(0x1BE6,0x1BE6,3),(0x1BE8,0x1BE9,3), + (0x1BED,0x1BED,3),(0x1BEF,0x1BF3,3),(0x1C2C,0x1C33,3),(0x1C36,0x1C37,3), + (0x1CD0,0x1CD2,3),(0x1CD4,0x1CE0,3),(0x1CE2,0x1CE8,3),(0x1CED,0x1CED,3), + (0x1CF4,0x1CF4,3),(0x1CF8,0x1CF9,3),(0x1DC0,0x1DFF,3),(0x200D,0x200D,3), + (0x20D0,0x20F0,3),(0x2CEF,0x2CF1,3),(0x2D7F,0x2D7F,3),(0x2DE0,0x2DFF,3), + (0x302A,0x302F,3),(0x3099,0x309A,3),(0xA66F,0xA672,3),(0xA674,0xA67D,3), + (0xA69E,0xA69F,3),(0xA6F0,0xA6F1,3),(0xA802,0xA802,3),(0xA806,0xA806,3), + (0xA80B,0xA80B,3),(0xA825,0xA826,3),(0xA82C,0xA82C,3),(0xA8C4,0xA8C5,3), + (0xA8E0,0xA8F1,3),(0xA8FF,0xA8FF,3),(0xA926,0xA92D,3),(0xA947,0xA951,3), + (0xA953,0xA953,3),(0xA980,0xA982,3),(0xA989,0xA98B,2),(0xA98F,0xA9B2,2), + (0xA9B3,0xA9B3,3),(0xA9B6,0xA9B9,3),(0xA9BC,0xA9BD,3),(0xA9C0,0xA9C0,1), + (0xA9E0,0xA9E4,2),(0xA9E5,0xA9E5,3),(0xA9E7,0xA9EF,2),(0xA9FA,0xA9FE,2), + (0xAA29,0xAA2E,3),(0xAA31,0xAA32,3),(0xAA35,0xAA36,3),(0xAA43,0xAA43,3), + (0xAA4C,0xAA4C,3),(0xAA60,0xAA6F,2),(0xAA71,0xAA73,2),(0xAA7A,0xAA7A,2), + (0xAA7C,0xAA7C,3),(0xAA7E,0xAA7F,2),(0xAAB0,0xAAB0,3),(0xAAB2,0xAAB4,3), + (0xAAB7,0xAAB8,3),(0xAABE,0xAABF,3),(0xAAC1,0xAAC1,3),(0xAAE0,0xAAEA,2), + (0xAAEC,0xAAED,3),(0xAAF6,0xAAF6,1),(0xABC0,0xABDA,2),(0xABE5,0xABE5,3), + (0xABE8,0xABE8,3),(0xABED,0xABED,3),(0xFB1E,0xFB1E,3),(0xFE00,0xFE0F,3), + (0xFE20,0xFE2F,3),(0xFF9E,0xFF9F,3),(0x101FD,0x101FD,3),(0x102E0,0x102E0,3), + (0x10376,0x1037A,3),(0x10A00,0x10A00,2),(0x10A01,0x10A03,3),(0x10A05,0x10A06,3), + (0x10A0C,0x10A0F,3),(0x10A10,0x10A13,2),(0x10A15,0x10A17,2),(0x10A19,0x10A35,2), + (0x10A38,0x10A3A,3),(0x10A3F,0x10A3F,1),(0x10AE5,0x10AE6,3),(0x10D24,0x10D27,3), + (0x10D69,0x10D6D,3),(0x10EAB,0x10EAC,3),(0x10EFA,0x10EFF,3),(0x10F46,0x10F50,3), + (0x10F82,0x10F85,3),(0x11001,0x11001,3),(0x11038,0x11046,3),(0x11070,0x11070,3), + (0x11073,0x11074,3),(0x1107F,0x11081,3),(0x110B3,0x110B6,3),(0x110B9,0x110BA,3), + (0x110C2,0x110C2,3),(0x11100,0x11102,3),(0x11103,0x11126,2),(0x11127,0x1112B,3), + (0x1112D,0x11132,3),(0x11133,0x11133,1),(0x11134,0x11134,3),(0x11144,0x11144,2), + (0x11147,0x11147,2),(0x11173,0x11173,3),(0x11180,0x11181,3),(0x111B6,0x111BE,3), + (0x111C0,0x111C0,3),(0x111C9,0x111CC,3),(0x111CF,0x111CF,3),(0x1122F,0x11231,3), + (0x11234,0x11237,3),(0x1123E,0x1123E,3),(0x11241,0x11241,3),(0x112DF,0x112DF,3), + (0x112E3,0x112EA,3),(0x11300,0x11301,3),(0x1133B,0x1133C,3),(0x1133E,0x1133E,3), + (0x11340,0x11340,3),(0x1134D,0x1134D,3),(0x11357,0x11357,3),(0x11366,0x1136C,3), + (0x11370,0x11374,3),(0x11380,0x11389,2),(0x1138B,0x1138B,2),(0x1138E,0x1138E,2), + (0x11390,0x113B5,2),(0x113B8,0x113B8,3),(0x113BB,0x113C0,3),(0x113C2,0x113C2,3), + (0x113C5,0x113C5,3),(0x113C7,0x113C9,3),(0x113CE,0x113CF,3),(0x113D0,0x113D0,1), + (0x113D2,0x113D2,3),(0x113E1,0x113E2,3),(0x11438,0x1143F,3),(0x11442,0x11444,3), + (0x11446,0x11446,3),(0x1145E,0x1145E,3),(0x114B0,0x114B0,3),(0x114B3,0x114B8,3), + (0x114BA,0x114BA,3),(0x114BD,0x114BD,3),(0x114BF,0x114C0,3),(0x114C2,0x114C3,3), + (0x115AF,0x115AF,3),(0x115B2,0x115B5,3),(0x115BC,0x115BD,3),(0x115BF,0x115C0,3), + (0x115DC,0x115DD,3),(0x11633,0x1163A,3),(0x1163D,0x1163D,3),(0x1163F,0x11640,3), + (0x116AB,0x116AB,3),(0x116AD,0x116AD,3),(0x116B0,0x116B7,3),(0x1171D,0x1171D,3), + (0x1171F,0x1171F,3),(0x11722,0x11725,3),(0x11727,0x1172B,3),(0x1182F,0x11837,3), + (0x11839,0x1183A,3),(0x11900,0x11906,2),(0x11909,0x11909,2),(0x1190C,0x11913,2), + (0x11915,0x11916,2),(0x11918,0x1192F,2),(0x11930,0x11930,3),(0x1193B,0x1193D,3), + (0x1193E,0x1193E,1),(0x11943,0x11943,3),(0x119D4,0x119D7,3),(0x119DA,0x119DB,3), + (0x119E0,0x119E0,3),(0x11A00,0x11A00,2),(0x11A01,0x11A0A,3),(0x11A0B,0x11A32,2), + (0x11A33,0x11A38,3),(0x11A3B,0x11A3E,3),(0x11A47,0x11A47,1),(0x11A50,0x11A50,2), + (0x11A51,0x11A56,3),(0x11A59,0x11A5B,3),(0x11A5C,0x11A83,2),(0x11A8A,0x11A96,3), + (0x11A98,0x11A98,3),(0x11A99,0x11A99,1),(0x11B60,0x11B60,3),(0x11B62,0x11B64,3), + (0x11B66,0x11B66,3),(0x11C30,0x11C36,3),(0x11C38,0x11C3D,3),(0x11C3F,0x11C3F,3), + (0x11C92,0x11CA7,3),(0x11CAA,0x11CB0,3),(0x11CB2,0x11CB3,3),(0x11CB5,0x11CB6,3), + (0x11D31,0x11D36,3),(0x11D3A,0x11D3A,3),(0x11D3C,0x11D3D,3),(0x11D3F,0x11D45,3), + (0x11D47,0x11D47,3),(0x11D90,0x11D91,3),(0x11D95,0x11D95,3),(0x11D97,0x11D97,3), + (0x11EF3,0x11EF4,3),(0x11F00,0x11F01,3),(0x11F04,0x11F10,2),(0x11F12,0x11F33,2), + (0x11F36,0x11F3A,3),(0x11F40,0x11F41,3),(0x11F42,0x11F42,1),(0x11F5A,0x11F5A,3), + (0x13440,0x13440,3),(0x13447,0x13455,3),(0x1611E,0x16129,3),(0x1612D,0x1612F,3), + (0x16AF0,0x16AF4,3),(0x16B30,0x16B36,3),(0x16F4F,0x16F4F,3),(0x16F8F,0x16F92,3), + (0x16FE4,0x16FE4,3),(0x16FF0,0x16FF1,3),(0x1BC9D,0x1BC9E,3),(0x1CF00,0x1CF2D,3), + (0x1CF30,0x1CF46,3),(0x1D165,0x1D169,3),(0x1D16D,0x1D172,3),(0x1D17B,0x1D182,3), + (0x1D185,0x1D18B,3),(0x1D1AA,0x1D1AD,3),(0x1D242,0x1D244,3),(0x1DA00,0x1DA36,3), + (0x1DA3B,0x1DA6C,3),(0x1DA75,0x1DA75,3),(0x1DA84,0x1DA84,3),(0x1DA9B,0x1DA9F,3), + (0x1DAA1,0x1DAAF,3),(0x1E000,0x1E006,3),(0x1E008,0x1E018,3),(0x1E01B,0x1E021,3), + (0x1E023,0x1E024,3),(0x1E026,0x1E02A,3),(0x1E08F,0x1E08F,3),(0x1E130,0x1E136,3), + (0x1E2AE,0x1E2AE,3),(0x1E2EC,0x1E2EF,3),(0x1E4EC,0x1E4EF,3),(0x1E5EE,0x1E5EF,3), + (0x1E6E3,0x1E6E3,3),(0x1E6E6,0x1E6E6,3),(0x1E6EE,0x1E6EF,3),(0x1E6F5,0x1E6F5,3), + (0x1E8D0,0x1E8D6,3),(0x1E944,0x1E94A,3),(0x1F3FB,0x1F3FF,3),(0xE0020,0xE007F,3), + (0xE0100,0xE01EF,3), +) + diff --git a/Python/stdlib_module_names.h b/Python/stdlib_module_names.h index e5f73e638f019f..d6ca5acd73d63f 100644 --- a/Python/stdlib_module_names.h +++ b/Python/stdlib_module_names.h @@ -65,6 +65,7 @@ static const char* _Py_stdlib_module_names[] = { "_posixsubprocess", "_py_abc", "_py_grapheme", +"_py_grapheme_db", "_py_warnings", "_pydatetime", "_pydecimal", diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 5db850ca2d1f0c..887b88d1038872 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -166,6 +166,7 @@ def maketables(trace=0): makeunicodename(unicode, trace) makeunicodedata(unicode, trace) makeunicodetype(unicode, trace) + makegraphemedata(unicode, trace) # -------------------------------------------------------------------- @@ -704,6 +705,133 @@ def makeunicodetype(unicode, trace): fprint() +# -------------------------------------------------------------------- +# grapheme cluster break data for pure Python implementation + +def makegraphemedata(unicode, trace): + + FILE = "Lib/_py_grapheme_db.py" + + print("--- Preparing", FILE, "...") + + gcb_names = GRAPHEME_CLUSTER_NAMES + incb_names = INDIC_CONJUNCT_BREAK_NAMES + + # Build per-codepoint GCB array, excluding LV/LVT (computed algorithmically) + gcb_values = [] + for char in unicode.chars: + gcb_name = unicode.grapheme_breaks[char] or 'Other' + gcb_values.append(gcb_names.index(gcb_name)) + + # Compress into ranges: (start, end, value), skipping LV/LVT and Other + gcb_lv = gcb_names.index('LV') + gcb_lvt = gcb_names.index('LVT') + gcb_other = gcb_names.index('Other') + + gcb_ranges = _compress_ranges( + gcb_values, skip_values={gcb_lv, gcb_lvt, gcb_other}, + ) + + # Build per-codepoint ExtPict array + ep_ranges = _compress_ranges( + [int(v) for v in unicode.ext_picts], skip_values={0}, + ) + + # Build per-codepoint InCB array + incb_values = [] + for char in unicode.chars: + record = unicode.table[char] + if record: + incb_values.append(incb_names.index(record.incb)) + else: + incb_values.append(0) # None + + incb_none = incb_names.index('None') + incb_ranges = _compress_ranges( + incb_values, skip_values={incb_none}, + ) + + print(len(gcb_ranges), "GCB ranges") + print(len(ep_ranges), "Extended_Pictographic ranges") + print(len(incb_ranges), "InCB ranges") + + print("--- Writing", FILE, "...") + + with open(FILE, "w") as fp: + fprint = partial(print, file=fp) + + fprint("# This file was generated by %s %s." % (SCRIPT, VERSION)) + fprint("# Unicode version: %s." % UNIDATA_VERSION) + fprint("#") + fprint("# Property tables for the pure Python grapheme cluster break") + fprint("# implementation in _py_grapheme.py.") + fprint("#") + fprint("# Each table is a tuple of (start, end, value) ranges sorted") + fprint("# by start codepoint. The default value for codepoints not") + fprint("# covered is 0.") + fprint() + + fprint("UNIDATA_VERSION = %r" % UNIDATA_VERSION) + fprint() + + # GCB constants + fprint("# Grapheme_Cluster_Break values") + for i, name in enumerate(gcb_names): + fprint("GCB_%s = %d" % (name, i)) + fprint() + + # InCB constants + fprint("# Indic_Conjunct_Break values") + for i, name in enumerate(incb_names): + fprint("InCB_%s = %d" % (name, i)) + fprint() + + # GCB ranges + fprint("# Grapheme_Cluster_Break ranges") + fprint("# (excludes LV/LVT, computed via Hangul syllable arithmetic)") + _write_ranges(fprint, "GCB_RANGES", gcb_ranges) + + # ExtPict ranges + fprint("# Extended_Pictographic ranges") + _write_ranges(fprint, "EXT_PICT_RANGES", ep_ranges, has_value=False) + + # InCB ranges + fprint("# Indic_Conjunct_Break ranges") + _write_ranges(fprint, "INCB_RANGES", incb_ranges) + + +def _compress_ranges(values, skip_values): + """Compress a per-codepoint value array into sorted (start, end, value) ranges.""" + ranges = [] + n = len(values) + i = 0 + while i < n: + v = values[i] + if v in skip_values: + i += 1 + continue + start = i + while i < n and values[i] == v: + i += 1 + ranges.append((start, i - 1, v)) + return ranges + + +def _write_ranges(fprint, name, ranges, has_value=True): + """Write a range table as a Python tuple of tuples.""" + fprint("%s = (" % name) + per_line = 4 if has_value else 5 + for i in range(0, len(ranges), per_line): + chunk = ranges[i:i + per_line] + if has_value: + parts = ["(0x%04X,0x%04X,%d)" % (s, e, v) for s, e, v in chunk] + else: + parts = ["(0x%04X,0x%04X)" % (s, e) for s, e, _v in chunk] + fprint(" %s," % ",".join(parts)) + fprint(")") + fprint() + + # -------------------------------------------------------------------- # unicode name database From 5701c0bb2d83f7eec642a71e996414d7e44bcf07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Tue, 7 Apr 2026 17:08:25 +0200 Subject: [PATCH 5/6] Fix newlines to make linter happy --- Lib/_py_grapheme_db.py | 1 - Tools/unicode/makeunicodedata.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/_py_grapheme_db.py b/Lib/_py_grapheme_db.py index 23a5c726b91530..f2130a70dcbb64 100644 --- a/Lib/_py_grapheme_db.py +++ b/Lib/_py_grapheme_db.py @@ -342,4 +342,3 @@ (0x1E8D0,0x1E8D6,3),(0x1E944,0x1E94A,3),(0x1F3FB,0x1F3FF,3),(0xE0020,0xE007F,3), (0xE0100,0xE01EF,3), ) - diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 887b88d1038872..0370835af80350 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -790,10 +790,12 @@ def makegraphemedata(unicode, trace): fprint("# Grapheme_Cluster_Break ranges") fprint("# (excludes LV/LVT, computed via Hangul syllable arithmetic)") _write_ranges(fprint, "GCB_RANGES", gcb_ranges) + fprint() # ExtPict ranges fprint("# Extended_Pictographic ranges") _write_ranges(fprint, "EXT_PICT_RANGES", ep_ranges, has_value=False) + fprint() # InCB ranges fprint("# Indic_Conjunct_Break ranges") @@ -829,7 +831,6 @@ def _write_ranges(fprint, name, ranges, has_value=True): parts = ["(0x%04X,0x%04X)" % (s, e) for s, e, _v in chunk] fprint(" %s," % ",".join(parts)) fprint(")") - fprint() # -------------------------------------------------------------------- From e073e06a18ed1f3409a77da8232bc931815f2b13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Tue, 7 Apr 2026 17:33:05 +0200 Subject: [PATCH 6/6] Achieve 100% statement and branch test coverage --- Lib/_py_grapheme.py | 6 ++---- Lib/test/test_unicodedata.py | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/Lib/_py_grapheme.py b/Lib/_py_grapheme.py index 9cc6194788e5a2..b55b42994b7cb3 100644 --- a/Lib/_py_grapheme.py +++ b/Lib/_py_grapheme.py @@ -67,10 +67,8 @@ def __repr__(self): def _get_gcb(cp): """Return the Grapheme_Cluster_Break value for a codepoint.""" idx = bisect_right(_GCB_STARTS, cp) - 1 - if idx >= 0: - entry = GCB_RANGES[idx] - if cp <= entry[1]: - return entry[2] + if idx >= 0 and cp <= GCB_RANGES[idx][1]: + return GCB_RANGES[idx][2] # Hangul syllables: LV if (cp - S_BASE) % T_COUNT == 0, else LVT if _HANGUL_S_BASE <= cp < _HANGUL_S_BASE + _HANGUL_S_COUNT: if (cp - _HANGUL_S_BASE) % _HANGUL_T_COUNT == 0: diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 330f1be0dcde15..8f37a7bafb8997 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -1182,17 +1182,6 @@ def test_linebreak_7643(self): self.assertEqual(len(lines), 1, r"%a should not be a linebreak" % c) - def test_segment_object(self): - segments = list(unicodedata.iter_graphemes('spa\u0300m')) - self.assertEqual(len(segments), 4, segments) - segment = segments[2] - self.assertEqual(segment.start, 2) - self.assertEqual(segment.end, 4) - self.assertEqual(str(segment), 'a\u0300') - self.assertEqual(repr(segment), '') - self.assertRaises(TypeError, iter, segment) - self.assertRaises(TypeError, len, segment) - class NormalizationTest(unittest.TestCase): @staticmethod @@ -1384,6 +1373,18 @@ def test_segment_object(self): self.assertEqual(segment.start, 2) self.assertEqual(segment.end, 4) self.assertEqual(str(segment), 'a\u0300') + self.assertEqual(repr(segment), '') + self.assertRaises(TypeError, iter, segment) + self.assertRaises(TypeError, len, segment) + + def test_grapheme_break_fsm_edges(self): + graphemes = self._graphemes + # ExtPict followed by non-Extend/non-ZWJ resets the EP FSM + self.assertEqual(graphemes('\u2764b'), ['\u2764', 'b']) + # Consonant followed by InCB Extend (not Linker) stays in Started + self.assertEqual(graphemes('\u0915\u0951'), ['\u0915\u0951']) + # Consonant followed by InCB None resets InCB FSM + self.assertEqual(graphemes('\u0915b'), ['\u0915', 'b']) def _graphemes(self, *args): return list(map(str, self.iter_graphemes(*args)))