Skip to content

Commit f2b61fc

Browse files
authored
fix: edit pyarrow stringify to better handle emojis and accents (apache#22881)
1 parent c839d0d commit f2b61fc

File tree

5 files changed

+97
-12
lines changed

5 files changed

+97
-12
lines changed

superset/result_set.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,12 @@ def stringify_values(array: NDArray[Any]) -> NDArray[Any]:
7272
# pandas <NA> type cannot be converted to string
7373
obj[na_obj] = None # type: ignore
7474
else:
75-
obj[...] = stringify(obj) # type: ignore
75+
try:
76+
# for simple string conversions
77+
# this handles odd character types better
78+
obj[...] = obj.astype(str) # type: ignore
79+
except ValueError:
80+
obj[...] = stringify(obj) # type: ignore
7681

7782
return result
7883

superset/utils/pandas_postprocessing/boxplot.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -57,10 +57,10 @@ def boxplot(
5757
"""
5858

5959
def quartile1(series: Series) -> float:
60-
return np.nanpercentile(series, 25, interpolation="midpoint") # type: ignore
60+
return np.nanpercentile(series, 25, method="midpoint")
6161

6262
def quartile3(series: Series) -> float:
63-
return np.nanpercentile(series, 75, interpolation="midpoint") # type: ignore
63+
return np.nanpercentile(series, 75, method="midpoint")
6464

6565
if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:
6666

tests/integration_tests/result_set_tests.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -169,13 +169,13 @@ def test_nested_types(self):
169169
"id": 4,
170170
"dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]',
171171
"num_arr": "[1, 2, 3]",
172-
"map_col": '{"chart_name": "scatter"}',
172+
"map_col": "{'chart_name': 'scatter'}",
173173
},
174174
{
175175
"id": 3,
176176
"dict_arr": '[{"table_name": "birth_names", "database_id": 1}]',
177177
"num_arr": "[4, 5, 6]",
178-
"map_col": '{"chart_name": "plot"}',
178+
"map_col": "{'chart_name': 'plot'}",
179179
},
180180
],
181181
)

tests/unit_tests/dataframe_test.py

Lines changed: 81 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,87 @@ def test_df_to_records_NaT_type() -> None:
5555

5656
assert df_to_records(df) == [
5757
{"date": None},
58-
{"date": '"2023-01-06T20:50:31.749000+00:00"'},
58+
{"date": "2023-01-06 20:50:31.749000+00:00"},
59+
]
60+
61+
62+
def test_df_to_records_mixed_emoji_type() -> None:
63+
from superset.db_engine_specs import BaseEngineSpec
64+
from superset.result_set import SupersetResultSet
65+
66+
data = [
67+
("What's up?", "This is a string text", 1),
68+
("What's up?", "This is a string with an 😍 added", 2),
69+
("What's up?", NaT, 3),
70+
("What's up?", "Last emoji 😁", 4),
71+
]
72+
73+
cursor_descr: DbapiDescription = [
74+
("question", "varchar", None, None, None, None, False),
75+
("response", "varchar", None, None, None, None, False),
76+
("count", "integer", None, None, None, None, False),
77+
]
78+
79+
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
80+
df = results.to_pandas_df()
81+
82+
assert df_to_records(df) == [
83+
{"question": "What's up?", "response": "This is a string text", "count": 1},
84+
{
85+
"question": "What's up?",
86+
"response": "This is a string with an 😍 added",
87+
"count": 2,
88+
},
89+
{
90+
"question": "What's up?",
91+
"response": None,
92+
"count": 3,
93+
},
94+
{
95+
"question": "What's up?",
96+
"response": "Last emoji 😁",
97+
"count": 4,
98+
},
99+
]
100+
101+
102+
def test_df_to_records_mixed_accent_type() -> None:
103+
from superset.db_engine_specs import BaseEngineSpec
104+
from superset.result_set import SupersetResultSet
105+
106+
data = [
107+
("What's up?", "This is a string text", 1),
108+
("What's up?", "This is a string with áccent", 2),
109+
("What's up?", NaT, 3),
110+
("What's up?", "móre áccent", 4),
111+
]
112+
113+
cursor_descr: DbapiDescription = [
114+
("question", "varchar", None, None, None, None, False),
115+
("response", "varchar", None, None, None, None, False),
116+
("count", "integer", None, None, None, None, False),
117+
]
118+
119+
results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
120+
df = results.to_pandas_df()
121+
122+
assert df_to_records(df) == [
123+
{"question": "What's up?", "response": "This is a string text", "count": 1},
124+
{
125+
"question": "What's up?",
126+
"response": "This is a string with áccent",
127+
"count": 2,
128+
},
129+
{
130+
"question": "What's up?",
131+
"response": None,
132+
"count": 3,
133+
},
134+
{
135+
"question": "What's up?",
136+
"response": "móre áccent",
137+
"count": 4,
138+
},
59139
]
60140

61141

tests/unit_tests/result_set_test.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -98,10 +98,10 @@ def test_stringify_with_null_integers():
9898

9999
expected = np.array(
100100
[
101-
array(['"foo"', '"foo"', '"foo"'], dtype=object),
102-
array(['"bar"', '"bar"', '"bar"'], dtype=object),
101+
array(["foo", "foo", "foo"], dtype=object),
102+
array(["bar", "bar", "bar"], dtype=object),
103103
array([None, None, None], dtype=object),
104-
array([None, "true", None], dtype=object),
104+
array([None, "True", None], dtype=object),
105105
]
106106
)
107107

@@ -132,10 +132,10 @@ def test_stringify_with_null_timestamps():
132132

133133
expected = np.array(
134134
[
135-
array(['"foo"', '"foo"', '"foo"'], dtype=object),
136-
array(['"bar"', '"bar"', '"bar"'], dtype=object),
135+
array(["foo", "foo", "foo"], dtype=object),
136+
array(["bar", "bar", "bar"], dtype=object),
137137
array([None, None, None], dtype=object),
138-
array([None, "true", None], dtype=object),
138+
array([None, "True", None], dtype=object),
139139
]
140140
)
141141

0 commit comments

Comments
 (0)