From 06c269605896a39f526615b35d99856f2aad8b81 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sun, 31 May 2020 17:55:41 +0100 Subject: [PATCH 01/11] Use arrow parquet.read_table opposed to ParquetDataset --- pandas/io/parquet.py | 9 ++++----- pandas/tests/io/test_parquet.py | 8 ++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index cde7a98eb42ae..f1dba3690320b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -122,11 +122,10 @@ def write( file_obj_or_path.close() def read(self, path, columns=None, **kwargs): - parquet_ds = self.api.parquet.ParquetDataset( - path, filesystem=get_fs_for_path(path), **kwargs - ) - kwargs["columns"] = columns - result = parquet_ds.read_pandas(**kwargs).to_pandas() + kwargs["use_pandas_metadata"] = True + result = self.api.parquet.read_table( + path, columns=columns, filesystem=get_fs_for_path(path), **kwargs + ).to_pandas() return result diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 8a43d4079159b..8e32c72710ded 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1,6 +1,7 @@ """ test parquet compat """ import datetime from distutils.version import LooseVersion +from io import BytesIO import os from warnings import catch_warnings @@ -567,6 +568,13 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): repeat=1, ) + def test_file_like_obj_support(self, df_compat): + buffer = BytesIO() + df_compat.to_parquet(buffer) + df_from_buf = pd.read_parquet(buffer) + print(df_from_buf) + tm.assert_frame_equal(df_compat, df_from_buf) + def test_partition_cols_supported(self, pa, df_full): # GH #23283 partition_cols = ["bool", "int"] From 3f1496bd7213fb3496d70ad38ea72689039f81c8 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Mon, 1 Jun 2020 18:13:11 +0100 Subject: [PATCH 02/11] Importer skip --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 8e32c72710ded..ba80f508e61e9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -568,11 +568,11 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): repeat=1, ) + @td.skip_if_no("pyarrow") def test_file_like_obj_support(self, df_compat): buffer = BytesIO() df_compat.to_parquet(buffer) df_from_buf = pd.read_parquet(buffer) - print(df_from_buf) tm.assert_frame_equal(df_compat, df_from_buf) def test_partition_cols_supported(self, pa, df_full): From 8122015869d3e917746180977d1899cb7ace70bc Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Mon, 1 Jun 2020 23:20:07 +0100 Subject: [PATCH 03/11] Add simple parquet file for read url tests --- pandas/tests/io/data/parquet/simple.parquet | Bin 0 -> 2157 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/io/data/parquet/simple.parquet diff --git a/pandas/tests/io/data/parquet/simple.parquet b/pandas/tests/io/data/parquet/simple.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2862a91f508ea225ea9829d4843f330473977134 GIT binary patch literal 2157 zcmcJRUvJ_@5Wo#VBzKDX5S@{cc!;b!bW#b$B;?BJ`miPeOo0SQuyIvI{ug4fjn9AL zkSH&G?kA}Ff%+M$s!x6Fhv@7YXh`Y9>2xc>&d$uv{ARslI7fYsPSGX0*rwBTYLB9r zRDybu_>iRHv9*|Kihe~i1?&W$bdK3sT9}>0!Y4z$-Jk{I%hxzuI+5IZP38YemH#!Bqj#5VJYl=f{FKQ5ww3>7E4Q|BL;7U)Lxp0}zIJ2S z3Y`L;H%TawqIdDkzoFVW$d^fHbJ@ZdJciG!BJ(Fa5uk(l6-8wWOxjf(UGbTpQxB}^ z=*HcZHnT$8@!9wLKRmEg(1FByiZeFY!anOLwgCz!v@BTpf#LTy}YO&e9JyQfmx?u5#GKA5v!gtm`-qfM9yH1V5NQnfRE>~033^=SpL5$ zi0WC2Pb-sTAxG>PN-*>iZ{oFyK}|!W)V((Yk0mnbFj8l}G12iTdTNib@^ocC z?cx2fhC!kbyMSO5j}Xix2+T{T5lDPA5-Zb@)B!ny{ro$@wnDivJd(=YTeUe1RHtK} znY?PuTb=^G!ellUoYswM>y}_fW5KT6sNyB|33c(#Z`Y8i9rkZDrw2L|E!MYMO*j@# zI=)bA-Rq5EgyVE1PRI69x$5g87fR(gw=Au4h9i-)u&?S2eYJ-3$*wHkO{MaNyp1}r z*YBmneno3wCS>Yj#fA7&c_852z{o|$uv#)Cp2QXGs;St+iC!c2K$iM95Ti620B_Sb zWwA0*%Y9e1H4<+)>^IsMWApkMyEX^^dI0CQ_={u3|ghvk5N>EZr>fm??s z`$4+I{Q~{h#Qg(f>4g1iEv(^5sJP$o8vIX4pJ|8$Av$PZJh1C2as7^B;LYHD42|GG lbKU*L>tUZOR*Kxgpo`82&9jSa0e Date: Tue, 2 Jun 2020 10:05:17 +0100 Subject: [PATCH 04/11] Parquet read from url tests --- pandas/io/parquet.py | 4 ++++ pandas/tests/io/test_parquet.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f1dba3690320b..af1ab092880d7 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -122,10 +122,14 @@ def write( file_obj_or_path.close() def read(self, path, columns=None, **kwargs): + path, _, _, should_close = get_filepath_or_buffer(path) + kwargs["use_pandas_metadata"] = True result = self.api.parquet.read_table( path, columns=columns, filesystem=get_fs_for_path(path), **kwargs ).to_pandas() + if should_close: + path.close() return result diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ba80f508e61e9..a4b642657c955 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -462,6 +462,13 @@ def test_write_ignoring_index(self, engine): expected = df.reset_index(drop=True) check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + @tm.network + def test_parquet_read_from_url(self, engine, df_compat): + # TODO:alimcmaster1 update with master URL + url = "https://raw.githubusercontent.com/alimcmaster1/pandas/mcmali-parq-fix/pandas/tests/io/data/parquet/simple.parquet" + df = pd.read_parquet(url, engine=engine) + tm.assert_frame_equal(df, df_compat) + class TestParquetPyArrow(Base): def test_basic(self, pa, df_full): From daeb150b20bd8b1f49f72f8eda1e1cdf4d5f357d Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Tue, 2 Jun 2020 19:30:08 +0100 Subject: [PATCH 05/11] Handle S3 URLs seperately --- pandas/io/parquet.py | 10 ++++++++-- pandas/tests/io/test_parquet.py | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index af1ab092880d7..de9a14c82b3cb 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -122,14 +122,20 @@ def write( file_obj_or_path.close() def read(self, path, columns=None, **kwargs): - path, _, _, should_close = get_filepath_or_buffer(path) + fs = get_fs_for_path(path) + should_close = None + # Avoid calling get_filepath_or_buffer for s3/gcs URLs since + # since it returns an S3File which doesn't support dir reads in arrow + if not fs: + path, _, _, should_close = get_filepath_or_buffer(path) kwargs["use_pandas_metadata"] = True result = self.api.parquet.read_table( - path, columns=columns, filesystem=get_fs_for_path(path), **kwargs + path, columns=columns, filesystem=fs, **kwargs ).to_pandas() if should_close: path.close() + return result diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a4b642657c955..6706c3635c96e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -465,7 +465,10 @@ def test_write_ignoring_index(self, engine): @tm.network def test_parquet_read_from_url(self, engine, df_compat): # TODO:alimcmaster1 update with master URL - url = "https://raw.githubusercontent.com/alimcmaster1/pandas/mcmali-parq-fix/pandas/tests/io/data/parquet/simple.parquet" + url = ( + "https://raw.githubusercontent.com/alimcmaster1/pandas/" + "mcmali-parq-fix/pandas/tests/io/data/parquet/simple.parquet" + ) df = pd.read_parquet(url, engine=engine) tm.assert_frame_equal(df, df_compat) From ee32b3d79093b9e3774c223cc7262ce26094932b Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Tue, 2 Jun 2020 19:37:13 +0100 Subject: [PATCH 06/11] Add whatsnew --- doc/source/whatsnew/v1.0.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.5.rst b/doc/source/whatsnew/v1.0.5.rst index 1edc7e1cad72f..3472b0d10f885 100644 --- a/doc/source/whatsnew/v1.0.5.rst +++ b/doc/source/whatsnew/v1.0.5.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fix regression in :meth:`read_parquet` would raise a ``TypeError`` when passing in a file-`like object (:issue:`34467`) - .. _whatsnew_105.bug_fixes: From 9fa31780d9a6ed68ac14acf10e11ce075e8270fd Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Tue, 2 Jun 2020 20:05:49 +0100 Subject: [PATCH 07/11] Read file like fastparquet and pyarrow --- pandas/tests/io/test_parquet.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6706c3635c96e..ad7753c052cfd 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -472,6 +472,12 @@ def test_parquet_read_from_url(self, engine, df_compat): df = pd.read_parquet(url, engine=engine) tm.assert_frame_equal(df, df_compat) + def test_read_file_like_obj_support(self, df_compat, engine): + buffer = BytesIO() + df_compat.to_parquet(buffer) + df_from_buf = pd.read_parquet(buffer, engine=engine) + tm.assert_frame_equal(df_compat, df_from_buf) + class TestParquetPyArrow(Base): def test_basic(self, pa, df_full): @@ -578,13 +584,6 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): repeat=1, ) - @td.skip_if_no("pyarrow") - def test_file_like_obj_support(self, df_compat): - buffer = BytesIO() - df_compat.to_parquet(buffer) - df_from_buf = pd.read_parquet(buffer) - tm.assert_frame_equal(df_compat, df_from_buf) - def test_partition_cols_supported(self, pa, df_full): # GH #23283 partition_cols = ["bool", "int"] From 6ee9974ba0a880719814724d0c58e3ab8919db67 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Tue, 2 Jun 2020 23:32:43 +0100 Subject: [PATCH 08/11] Test just pyarrow --- pandas/tests/io/test_parquet.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ad7753c052cfd..16fcafd39043c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -462,22 +462,6 @@ def test_write_ignoring_index(self, engine): expected = df.reset_index(drop=True) check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) - @tm.network - def test_parquet_read_from_url(self, engine, df_compat): - # TODO:alimcmaster1 update with master URL - url = ( - "https://raw.githubusercontent.com/alimcmaster1/pandas/" - "mcmali-parq-fix/pandas/tests/io/data/parquet/simple.parquet" - ) - df = pd.read_parquet(url, engine=engine) - tm.assert_frame_equal(df, df_compat) - - def test_read_file_like_obj_support(self, df_compat, engine): - buffer = BytesIO() - df_compat.to_parquet(buffer) - df_from_buf = pd.read_parquet(buffer, engine=engine) - tm.assert_frame_equal(df_compat, df_from_buf) - class TestParquetPyArrow(Base): def test_basic(self, pa, df_full): @@ -584,6 +568,22 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): repeat=1, ) + @tm.network + def test_parquet_read_from_url(self, engine, df_compat): + # TODO:alimcmaster1 update with master URL + url = ( + "https://raw.githubusercontent.com/alimcmaster1/pandas/" + "mcmali-parq-fix/pandas/tests/io/data/parquet/simple.parquet" + ) + df = pd.read_parquet(url, engine=engine) + tm.assert_frame_equal(df, df_compat) + + def test_read_file_like_obj_support(self, df_compat, engine): + buffer = BytesIO() + df_compat.to_parquet(buffer) + df_from_buf = pd.read_parquet(buffer, engine=engine) + tm.assert_frame_equal(df_compat, df_from_buf) + def test_partition_cols_supported(self, pa, df_full): # GH #23283 partition_cols = ["bool", "int"] From 92a883d48a9e8d184a0b6f2fcac3f79239c17040 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 3 Jun 2020 09:12:13 +0100 Subject: [PATCH 09/11] Skip if no arrow --- pandas/tests/io/test_parquet.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 16fcafd39043c..027c905b0f3c9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -569,19 +569,21 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): ) @tm.network - def test_parquet_read_from_url(self, engine, df_compat): + @td.skip_if_no("pyarrow") + def test_parquet_read_from_url(self, df_compat): # TODO:alimcmaster1 update with master URL url = ( "https://raw.githubusercontent.com/alimcmaster1/pandas/" "mcmali-parq-fix/pandas/tests/io/data/parquet/simple.parquet" ) - df = pd.read_parquet(url, engine=engine) + df = pd.read_parquet(url) tm.assert_frame_equal(df, df_compat) - def test_read_file_like_obj_support(self, df_compat, engine): + @td.skip_if_no("pyarrow") + def test_read_file_like_obj_support(self, df_compat): buffer = BytesIO() df_compat.to_parquet(buffer) - df_from_buf = pd.read_parquet(buffer, engine=engine) + df_from_buf = pd.read_parquet(buffer) tm.assert_frame_equal(df_compat, df_from_buf) def test_partition_cols_supported(self, pa, df_full): From 5a15f4f6ac8527659ae7048193a8d96480381d39 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Jun 2020 20:10:10 +0200 Subject: [PATCH 10/11] remove whatsnew --- doc/source/whatsnew/v1.0.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.5.rst b/doc/source/whatsnew/v1.0.5.rst index 3472b0d10f885..1edc7e1cad72f 100644 --- a/doc/source/whatsnew/v1.0.5.rst +++ b/doc/source/whatsnew/v1.0.5.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fix regression in :meth:`read_parquet` would raise a ``TypeError`` when passing in a file-`like object (:issue:`34467`) +- - .. _whatsnew_105.bug_fixes: From 882f5a81aa894d530447a6bca5a74c935cfba790 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Jun 2020 20:14:42 +0200 Subject: [PATCH 11/11] update url --- pandas/tests/io/test_parquet.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 027c905b0f3c9..7ee551194bf76 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -571,10 +571,9 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): @tm.network @td.skip_if_no("pyarrow") def test_parquet_read_from_url(self, df_compat): - # TODO:alimcmaster1 update with master URL url = ( - "https://raw.githubusercontent.com/alimcmaster1/pandas/" - "mcmali-parq-fix/pandas/tests/io/data/parquet/simple.parquet" + "https://raw.githubusercontent.com/pandas-dev/pandas/" + "master/pandas/tests/io/data/parquet/simple.parquet" ) df = pd.read_parquet(url) tm.assert_frame_equal(df, df_compat)