From a689bbd3309597bde150639cfe1fe5014a7f68f8 Mon Sep 17 00:00:00 2001 From: Corvin McPherson Date: Tue, 29 Aug 2023 20:52:56 +0000 Subject: [PATCH 1/8] gh-108590: Fix sqlite3.iterdump for invalid unicode in text columns. --- Lib/sqlite3/dump.py | 16 +++++++++++++--- Lib/test/test_sqlite3/test_dump.py | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/Lib/sqlite3/dump.py b/Lib/sqlite3/dump.py index ead3360ce67608..1a83d29041a5e4 100644 --- a/Lib/sqlite3/dump.py +++ b/Lib/sqlite3/dump.py @@ -14,6 +14,11 @@ def _quote_name(name): def _quote_value(value): return "'{0}'".format(value.replace("'", "''")) +def _force_decode(bs, *args, **kwargs): + try: + return bs.decode(*args, **kwargs) + except UnicodeDecodeError: + return "".join(chr(c) for c in bs) def _iterdump(connection): """ @@ -73,9 +78,14 @@ def _iterdump(connection): "||quote({0})||".format(_quote_name(col)) for col in column_names ) ) - query_res = cu.execute(q) - for row in query_res: - yield("{0};".format(row[0])) + orig_text_factory = connection.text_factory + try: + connection.text_factory = bytes + query_res = cu.execute(q) + for row in query_res: + yield("{0};".format(_force_decode(row[0]))) + finally: + connection.text_factory = orig_text_factory # Now when the type is 'index', 'trigger', or 'view' q = """ diff --git a/Lib/test/test_sqlite3/test_dump.py b/Lib/test/test_sqlite3/test_dump.py index 3107e1b165d950..8f56d1cb55fb15 100644 --- a/Lib/test/test_sqlite3/test_dump.py +++ b/Lib/test/test_sqlite3/test_dump.py @@ -133,6 +133,20 @@ def test_dump_virtual_tables(self): actual = list(self.cx.iterdump()) self.assertEqual(expected, actual) + def test_dump_unicode_invalid(self): + expected = [ + "BEGIN TRANSACTION;", + "CREATE TABLE foo (data TEXT);", + "INSERT INTO \"foo\" VALUES('a\x9f');", + "COMMIT;", + ] + self.cu.executescript(""" + CREATE TABLE foo (data TEXT); + INSERT INTO foo VALUES (CAST(X'619f' AS TEXT)); + """) + actual = list(self.cx.iterdump()) + 
self.assertEqual(expected, actual) + if __name__ == "__main__": unittest.main() From 03f41dfd65463faf13763cc171a27c2500cb80ca Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Tue, 29 Aug 2023 22:53:49 +0000 Subject: [PATCH 2/8] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst diff --git a/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst b/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst new file mode 100644 index 00000000000000..03e9f0333b865f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst @@ -0,0 +1 @@ +Fixed an issue causing sqlite3.iterdump to crash when encountering invalid unicode in a TEXT column. From e2939d58c5e75cd5e997a37337ea38ea0f8bb23d Mon Sep 17 00:00:00 2001 From: Corvin Date: Wed, 30 Aug 2023 03:41:11 -0400 Subject: [PATCH 3/8] Update Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst Co-authored-by: Erlend E. Aasland --- .../next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst b/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst index 03e9f0333b865f..bdd1001be44797 100644 --- a/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst +++ b/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst @@ -1 +1 @@ -Fixed an issue causing sqlite3.iterdump to crash when encountering invalid unicode in a TEXT column. 
+Fixed an issue where :meth:`sqlite3.Connection.iterdump` would fail and leave an incomplete SQL dump if a table includes invalid Unicode sequences. From 0643ffcf07fa0381690843cc60fb975fe5912c09 Mon Sep 17 00:00:00 2001 From: Corvin Date: Wed, 30 Aug 2023 03:41:30 -0400 Subject: [PATCH 4/8] Update Lib/sqlite3/dump.py Co-authored-by: Erlend E. Aasland --- Lib/sqlite3/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/sqlite3/dump.py b/Lib/sqlite3/dump.py index 1a83d29041a5e4..8a5086637179c0 100644 --- a/Lib/sqlite3/dump.py +++ b/Lib/sqlite3/dump.py @@ -18,7 +18,7 @@ def _force_decode(bs, *args, **kwargs): try: return bs.decode(*args, **kwargs) except UnicodeDecodeError: - return "".join(chr(c) for c in bs) + return "".join([chr(c) for c in bs]) def _iterdump(connection): """ From 608d919240456b4dbde3b120afdc7b2494da04d2 Mon Sep 17 00:00:00 2001 From: Corvin McPherson Date: Wed, 30 Aug 2023 07:53:19 +0000 Subject: [PATCH 5/8] Address comments --- Lib/sqlite3/dump.py | 21 +++++++++++++------ Lib/test/test_sqlite3/test_dump.py | 1 + ...-08-29-22-53-48.gh-issue-108590.6k0pOl.rst | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/Lib/sqlite3/dump.py b/Lib/sqlite3/dump.py index 8a5086637179c0..1e0ff3915792c1 100644 --- a/Lib/sqlite3/dump.py +++ b/Lib/sqlite3/dump.py @@ -20,6 +20,19 @@ def _force_decode(bs, *args, **kwargs): except UnicodeDecodeError: return "".join([chr(c) for c in bs]) +class _ctx_text_factory: + def __init__(self, connection, text_factory): + self.connection = connection + self.text_factory = text_factory + self.orig_text_factory = None + + def __enter__(self): + self.orig_text_factory = self.connection.text_factory + self.connection.text_factory = self.text_factory + + def __exit__(self, type, value, tb): + self.connection.text_factory = self.orig_text_factory + def _iterdump(connection): """ Returns an iterator to the dump of the database in an SQL text format. 
@@ -78,14 +91,10 @@ def _iterdump(connection): "||quote({0})||".format(_quote_name(col)) for col in column_names ) ) - orig_text_factory = connection.text_factory - try: - connection.text_factory = bytes - query_res = cu.execute(q) + query_res = cu.execute(q) + with _ctx_text_factory(connection, bytes): for row in query_res: yield("{0};".format(_force_decode(row[0]))) - finally: - connection.text_factory = orig_text_factory # Now when the type is 'index', 'trigger', or 'view' q = """ diff --git a/Lib/test/test_sqlite3/test_dump.py b/Lib/test/test_sqlite3/test_dump.py index 8f56d1cb55fb15..0279ce68eeb5f1 100644 --- a/Lib/test/test_sqlite3/test_dump.py +++ b/Lib/test/test_sqlite3/test_dump.py @@ -134,6 +134,7 @@ def test_dump_virtual_tables(self): self.assertEqual(expected, actual) def test_dump_unicode_invalid(self): + # gh-108590 expected = [ "BEGIN TRANSACTION;", "CREATE TABLE foo (data TEXT);", diff --git a/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst b/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst index bdd1001be44797..50b41f2a94d9be 100644 --- a/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst +++ b/Misc/NEWS.d/next/Library/2023-08-29-22-53-48.gh-issue-108590.6k0pOl.rst @@ -1 +1 @@ -Fixed an issue where :meth:`sqlite3.Connection.iterdump` would fail and leave an incomplete SQL dump if a table includes invalid Unicode sequences. +Fixed an issue where :meth:`sqlite3.Connection.iterdump` would fail and leave an incomplete SQL dump if a table includes invalid Unicode sequences. 
Patch by Corvin McPherson From cc7fba8aa17e7a7cece54f87111d3f56967a43f0 Mon Sep 17 00:00:00 2001 From: Corvin McPherson Date: Wed, 30 Aug 2023 08:11:37 +0000 Subject: [PATCH 6/8] Use contextmanager --- Lib/sqlite3/dump.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Lib/sqlite3/dump.py b/Lib/sqlite3/dump.py index 1e0ff3915792c1..21ed05383e97a3 100644 --- a/Lib/sqlite3/dump.py +++ b/Lib/sqlite3/dump.py @@ -7,6 +7,8 @@ # future enhancements, you should normally quote any identifier that # is an English language word, even if you do not have to." +from contextlib import contextmanager + def _quote_name(name): return '"{0}"'.format(name.replace('"', '""')) @@ -20,18 +22,14 @@ def _force_decode(bs, *args, **kwargs): except UnicodeDecodeError: return "".join([chr(c) for c in bs]) -class _ctx_text_factory: - def __init__(self, connection, text_factory): - self.connection = connection - self.text_factory = text_factory - self.orig_text_factory = None - - def __enter__(self): - self.orig_text_factory = self.connection.text_factory - self.connection.text_factory = self.text_factory - - def __exit__(self, type, value, tb): - self.connection.text_factory = self.orig_text_factory +@contextmanager +def _text_factory(con, factory): + saved_factory = con.text_factory + con.text_factory = factory + try: + yield + finally: + con.text_factory = saved_factory def _iterdump(connection): """ @@ -92,7 +90,7 @@ def _iterdump(connection): ) ) query_res = cu.execute(q) - with _ctx_text_factory(connection, bytes): + with _text_factory(connection, bytes): for row in query_res: yield("{0};".format(_force_decode(row[0]))) From c1b730d28abc136e550324d165aed0caef4e4b5b Mon Sep 17 00:00:00 2001 From: "Erlend E. 
Aasland" Date: Wed, 30 Aug 2023 10:24:02 +0200 Subject: [PATCH 7/8] Style nits and a comment --- Lib/sqlite3/dump.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Lib/sqlite3/dump.py b/Lib/sqlite3/dump.py index 21ed05383e97a3..481d605194c7fe 100644 --- a/Lib/sqlite3/dump.py +++ b/Lib/sqlite3/dump.py @@ -7,8 +7,10 @@ # future enhancements, you should normally quote any identifier that # is an English language word, even if you do not have to." + from contextlib import contextmanager + def _quote_name(name): return '"{0}"'.format(name.replace('"', '""')) @@ -16,12 +18,15 @@ def _quote_name(name): def _quote_value(value): return "'{0}'".format(value.replace("'", "''")) + def _force_decode(bs, *args, **kwargs): + # gh-108590: Don't fail if the database contains invalid Unicode data. try: return bs.decode(*args, **kwargs) except UnicodeDecodeError: return "".join([chr(c) for c in bs]) + @contextmanager def _text_factory(con, factory): saved_factory = con.text_factory @@ -31,6 +36,7 @@ def _text_factory(con, factory): finally: con.text_factory = saved_factory + def _iterdump(connection): """ Returns an iterator to the dump of the database in an SQL text format. 
From 365b16e9047bb361f8db30e37e2f11680989aaa6 Mon Sep 17 00:00:00 2001 From: Corvin McPherson Date: Wed, 30 Aug 2023 19:36:23 +0000 Subject: [PATCH 8/8] Add dump reproducibility test --- Lib/test/test_sqlite3/test_dump.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Lib/test/test_sqlite3/test_dump.py b/Lib/test/test_sqlite3/test_dump.py index 0279ce68eeb5f1..6402e9f0c390fa 100644 --- a/Lib/test/test_sqlite3/test_dump.py +++ b/Lib/test/test_sqlite3/test_dump.py @@ -148,6 +148,19 @@ def test_dump_unicode_invalid(self): actual = list(self.cx.iterdump()) self.assertEqual(expected, actual) + def test_dump_recreation(self): + self.cu.executescript(""" + CREATE TABLE foo (id INTEGER, text TEXT, blob BLOB); + INSERT INTO foo VALUES (0, CAST(X'619f' AS TEXT), X'619f'); + INSERT INTO foo VALUES (1, 'Hello SQLite!', X'98194eff46ab29f79064'); + """) + original_dump = list(self.cx.iterdump()) + with memory_database() as cx2: + query = "".join(original_dump) + cx2.executescript(query) + recreation_dump = list(cx2.iterdump()) + self.assertEqual(original_dump, recreation_dump) + if __name__ == "__main__": unittest.main()