Skip to content

Commit 442a97b

Browse files
committed
adds test case where payload data contains PUA unicode characters
1 parent 9410bc4 commit 442a97b

File tree

1 file changed

+51
-0
lines changed

1 file changed

+51
-0
lines changed

tests/pipeline/test_pipeline.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1663,3 +1663,54 @@ def api_fetch(page_num):
16631663
load_info = pipeline.run(product())
16641664
assert_load_info(load_info)
16651665
assert pipeline.last_trace.last_normalize_info.row_counts["product"] == 12
1666+
1667+
1668+
def test_run_with_pua_payload() -> None:
    """Check that text containing characters around ``PUA_START`` survives a load unchanged.

    Ten rows are loaded into a duckdb destination: the text is either a single
    character at/next to ``PUA_START``, such a character embedded in other
    text, or such a character at the very start of the text. The rows are then
    read back with SQL and compared against the literal strings that went in.

    NOTE(review): presumably dlt's JSON layer reserves this code point range for
    its own markers, which is why these payloads are interesting - confirm
    against ``dlt.common.json``.
    """
    # prepare some data and complete load with run
    os.environ["COMPLETED_PROB"] = "1.0"
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb")
    print(pipeline_name)
    from dlt.common.json import PUA_START, PUA_CHARACTER_MAX

    def some_data():
        yield from [
            # text is only PUA
            {"id": 1, "text": chr(PUA_START)},
            {"id": 2, "text": chr(PUA_START - 1)},
            {"id": 3, "text": chr(PUA_START + 1)},
            {"id": 4, "text": chr(PUA_START + PUA_CHARACTER_MAX + 1)},
            # PUA inside text
            {"id": 5, "text": f"a{chr(PUA_START)}b"},
            {"id": 6, "text": f"a{chr(PUA_START - 1)}b"},
            {"id": 7, "text": f"a{chr(PUA_START + 1)}b"},
            # text starts with PUA
            {"id": 8, "text": f"{chr(PUA_START)}a"},
            {"id": 9, "text": f"{chr(PUA_START - 1)}a"},
            {"id": 10, "text": f"{chr(PUA_START + 1)}a"},
        ]

    @dlt.source
    def source():
        return dlt.resource(some_data(), name="pua_data")

    load_info = p.run(source())
    # some_data() yields exactly ten records (ids 1..10), so ten rows must be normalized
    assert p.last_trace.last_normalize_info.row_counts["pua_data"] == 10

    with p.sql_client() as client:
        # order by id so the result lines up with the order of the payload above
        rows = client.execute_sql("SELECT text FROM pua_data ORDER BY id")

    values = [r[0] for r in rows]
    # every string must come back exactly as it was sent
    assert values == [
        "\uf026",
        "\uf025",
        "\uf027",
        "\uf02f",
        "a\uf026b",
        "a\uf025b",
        "a\uf027b",
        "\uf026a",
        "\uf025a",
        "\uf027a",
    ]
    assert len(load_info.loads_ids) == 1

0 commit comments

Comments
 (0)