Skip to content

maint: leftover bits after unicode 15 upgrade #201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 24 additions & 24 deletions maint/GenerateCommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,32 @@
# DATA LISTS
# ---------------------------------------------------------------------------

# BIDI classes in the DerivedBidiClass.txt file, short and long identifiers.
# Entries alternate short identifier, long identifier. GenerateUcd.py splits
# the list with [::2] and [1::2], so every pair must stay intact and the two
# halves must stay in the same order.

bidi_classes = [
    'AL', 'Arabic_Letter',
    'AN', 'Arabic_Number',
    'B', 'Paragraph_Separator',
    'BN', 'Boundary_Neutral',
    'CS', 'Common_Separator',
    'EN', 'European_Number',
    'ES', 'European_Separator',
    'ET', 'European_Terminator',
    'FSI', 'First_Strong_Isolate',
    'L', 'Left_To_Right',
    'LRE', 'Left_To_Right_Embedding',
    'LRI', 'Left_To_Right_Isolate',
    'LRO', 'Left_To_Right_Override',
    'NSM', 'Nonspacing_Mark',
    'ON', 'Other_Neutral',
    'PDF', 'Pop_Directional_Format',
    'PDI', 'Pop_Directional_Isolate',
    'R', 'Right_To_Left',
    'RLE', 'Right_To_Left_Embedding',
    'RLI', 'Right_To_Left_Isolate',
    'RLO', 'Right_To_Left_Override',
    'S', 'Segment_Separator',
    'WS', 'White_Space',
]

# Particular category property names, with comments. NOTE: If ever this list
Expand Down
31 changes: 21 additions & 10 deletions maint/GenerateUcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,19 +271,27 @@
# ---------------------------------------------------------------------------


# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or
# DerivedGeneralCategory.txt (DerivedBidiClass.txt is handled by get_bidi)

def make_get_names(enum):
    """Build a value parser for files whose second field is a property name.

    enum is the list of known names. The returned function takes the
    semicolon-split fields of one data line and returns the position of
    field 1 within enum. list.index raises ValueError if the name is not
    in the list.
    """
    def get_name_index(chardata):
        return enum.index(chardata[1])

    return get_name_index


# Parse a line of DerivedBidiClass.txt

def get_bidi(chardata):
    """Return the bidi class index for one parsed line of DerivedBidiClass.txt.

    chardata[1] may be a short identifier (such as 'AL') or a long one
    (such as 'Arabic_Letter'). Every short identifier has at most three
    characters and every long one has more, so the length selects which
    list to search. Both lists are slices of the same paired list, so the
    resulting index is the same whichever form the line used.
    """
    name = chardata[1]
    names = bidi_classes_long if len(name) > 3 else bidi_classes_short
    return names.index(name)


# Parse a line of CaseFolding.txt

def get_other_case(chardata):
    """Return the other-case offset for one parsed line of CaseFolding.txt.

    When chardata[1] is 'C' or 'S' the result is the signed distance from
    the code point in chardata[0] to the one in chardata[2] (both are
    hexadecimal strings). Any other kind of line yields None.
    """
    if chardata[1] in ('C', 'S'):
        return int(chardata[2], 16) - int(chardata[0], 16)
    # None (not 0) so that read_table skips the line instead of storing a
    # value over one that an earlier line already set for this code point.
    return None


# Parse a line of ScriptExtensions.txt
Expand Down Expand Up @@ -318,23 +326,25 @@ def read_table(file_name, get_value, default_value):

table = [default_value] * MAX_UNICODE
for line in file:
if file_base == 'DerivedBidiClass':
line = re.sub(r'# @missing: ', '', line)

line = re.sub(r'#.*', '', line)
chardata = list(map(str.strip, line.split(';')))
if len(chardata) <= 1:
continue
value = get_value(chardata)
if value is None:
continue
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
char = int(m.group(1), 16)
if m.group(3) is None:
last = char
else:
last = int(m.group(3), 16)
for i in range(char, last + 1):
# It is important not to overwrite a previously set value because in the
# CaseFolding file there are lines to be ignored (returning the default
# value of 0) which often come after a line which has already set data.
if table[i] == default_value:
table[i] = value
table[i] = value

file.close()
return table

Expand Down Expand Up @@ -508,7 +518,8 @@ def write_bitsets(list, item_size):
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
# remove them.

# bidi_classes itself is left intact: both halves of each pair are needed
# because get_bidi looks names up in either the short or the long list.
bidi_classes_short = bidi_classes[::2]
bidi_classes_long = bidi_classes[1::2]
break_properties = break_properties[::2]
category_names = category_names[::2]

Expand All @@ -518,7 +529,7 @@ def write_bitsets(list, item_size):
# Each call builds a table indexed by code point. The last argument is the
# value kept for code points that the data file does not set.
category = read_table(
    'Unicode.tables/DerivedGeneralCategory.txt',
    make_get_names(category_names),
    category_names.index('Cn'))
break_props = read_table(
    'Unicode.tables/GraphemeBreakProperty.txt',
    make_get_names(break_properties),
    break_properties.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
# get_bidi accepts both short and long class identifiers, which a plain
# make_get_names lookup over a single list cannot do.
bidi_class = read_table(
    'Unicode.tables/DerivedBidiClass.txt',
    get_bidi,
    bidi_classes_short.index('L'))

# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
Expand Down
Loading