@@ -910,112 +910,183 @@ def read_csv(
910
910
engine = engine ,
911
911
write_engine = write_engine ,
912
912
)
913
- if engine is not None and engine == "bigquery" :
914
- if any (param is not None for param in (dtype , names )):
915
- not_supported = ("dtype" , "names" )
916
- raise NotImplementedError (
917
- f"BigQuery engine does not support these arguments: { not_supported } . "
918
- f"{ constants .FEEDBACK_LINK } "
919
- )
920
913
921
- # TODO(b/338089659): Looks like we can relax this 1 column
922
- # restriction if we check the contents of an iterable are strings
923
- # not integers.
924
- if (
925
- # Empty tuples, None, and False are allowed and falsey.
926
- index_col
927
- and not isinstance (index_col , bigframes .enums .DefaultIndexKind )
928
- and not isinstance (index_col , str )
929
- ):
930
- raise NotImplementedError (
931
- "BigQuery engine only supports a single column name for `index_col`, "
932
- f"got: { repr (index_col )} . { constants .FEEDBACK_LINK } "
933
- )
914
+ if engine != "bigquery" :
915
+ # Using pandas.read_csv by default and warning about potential issues with
916
+ # large files.
917
+ return self ._read_csv_w_pandas_engines (
918
+ filepath_or_buffer ,
919
+ sep = sep ,
920
+ header = header ,
921
+ names = names ,
922
+ index_col = index_col ,
923
+ usecols = usecols , # type: ignore
924
+ dtype = dtype ,
925
+ engine = engine ,
926
+ encoding = encoding ,
927
+ write_engine = write_engine ,
928
+ ** kwargs ,
929
+ )
930
+ else :
931
+ return self ._read_csv_w_bigquery_engine (
932
+ filepath_or_buffer ,
933
+ sep = sep ,
934
+ header = header ,
935
+ names = names ,
936
+ index_col = index_col ,
937
+ usecols = usecols , # type: ignore
938
+ dtype = dtype ,
939
+ encoding = encoding ,
940
+ )
934
941
935
- # None and False cannot be passed to read_gbq.
936
- # TODO(b/338400133): When index_col is None, we should be using the
937
- # first column of the CSV as the index to be compatible with the
938
- # pandas engine. According to the pandas docs, only "False"
939
- # indicates a default sequential index.
940
- if not index_col :
941
- index_col = ()
942
def _read_csv_w_pandas_engines(
    self,
    filepath_or_buffer,
    *,
    sep,
    header,
    names,
    index_col,
    usecols,
    dtype,
    engine,
    encoding,
    write_engine,
    **kwargs,
) -> dataframe.DataFrame:
    """Read a CSV file locally with pandas and load it as a BigQuery DataFrame.

    Implementation backend for ``read_csv`` when the requested engine is one
    that pandas handles itself ('c', 'python', 'pyarrow'). The file is parsed
    by ``pandas.read_csv`` and the resulting frame is handed to
    ``self._read_pandas`` together with ``write_engine``.

    Args:
        filepath_or_buffer: Source passed through to ``pandas.read_csv``.
            Only ``str`` values are passed to ``self._check_file_size``.
        sep, header, names, index_col, usecols, dtype, engine, encoding:
            Forwarded unchanged to ``pandas.read_csv``.
        write_engine: Forwarded to ``self._read_pandas``.
        **kwargs: Extra ``pandas.read_csv`` options. ``chunksize`` and
            ``iterator`` are rejected.

    Raises:
        NotImplementedError: If ``index_col`` is a
            ``bigframes.enums.DefaultIndexKind`` or if ``chunksize`` /
            ``iterator`` is present in ``kwargs``.
    """
    # DefaultIndexKind is only meaningful to the BigQuery engine.
    if isinstance(index_col, bigframes.enums.DefaultIndexKind):
        raise NotImplementedError(
            f"With index_col={repr(index_col)}, only engine='bigquery' is supported. "
            f"{constants.FEEDBACK_LINK}"
        )

    # Either option being present at all (regardless of value) is rejected.
    if kwargs.keys() & {"chunksize", "iterator"}:
        raise NotImplementedError(
            "'chunksize' and 'iterator' arguments are not supported. "
            f"{constants.FEEDBACK_LINK}"
        )

    # Buffers and other non-str inputs are not size-checked.
    if isinstance(filepath_or_buffer, str):
        self._check_file_size(filepath_or_buffer)

    read_options = dict(
        sep=sep,
        header=header,
        names=names,
        index_col=index_col,
        usecols=usecols,
        dtype=dtype,
        engine=engine,
        encoding=encoding,
    )
    local_frame = pandas.read_csv(  # type: ignore
        filepath_or_buffer,
        **read_options,
        **kwargs,
    )
    return self._read_pandas(local_frame, api_name="read_csv", write_engine=write_engine)  # type: ignore
964
989
965
- if encoding is not None and encoding not in _VALID_ENCODINGS :
966
- raise NotImplementedError (
967
- f"BigQuery engine only supports the following encodings: { _VALID_ENCODINGS } . "
968
- f"{ constants .FEEDBACK_LINK } "
969
- )
990
def _read_csv_w_bigquery_engine(
    self,
    filepath_or_buffer,
    *,
    sep,
    header,
    names,
    index_col,
    usecols,
    dtype,
    encoding,
) -> dataframe.DataFrame:
    """Read a CSV file into a BigQuery DataFrame using a BigQuery load job.

    Implementation backend for ``read_csv`` when the 'bigquery' engine is
    selected. The arguments are validated, translated into a
    ``bigquery.LoadJobConfig`` (CSV source format, schema autodetect) and
    passed to ``self._loader.read_bigquery_load_job``, so the file is parsed
    by BigQuery rather than in local memory.

    Args:
        filepath_or_buffer: Source handed to the load job.
        sep: Field delimiter for the load job.
        header: ``None`` for "no header row", ``0`` for "header in the first
            row", or a positive 0-based row index of the header row.
        names: Must be ``None``; not supported by this engine.
        index_col: Falsey value (no index column), a single column name, or
            a ``bigframes.enums.DefaultIndexKind``.
        usecols: ``None`` or an iterable of column-name strings. It is
            consumed exactly once, so one-shot iterables are accepted.
        dtype: Must be ``None``; not supported by this engine.
        encoding: ``None`` or one of ``_VALID_ENCODINGS``.

    Raises:
        NotImplementedError: If ``dtype`` or ``names`` is given, if
            ``index_col`` is truthy but neither a ``str`` nor a
            ``DefaultIndexKind``, if ``usecols`` is not an iterable of
            strings, or if ``encoding`` is not in ``_VALID_ENCODINGS``.
    """
    if any(param is not None for param in (dtype, names)):
        not_supported = ("dtype", "names")
        raise NotImplementedError(
            f"BigQuery engine does not support these arguments: {not_supported}. "
            f"{constants.FEEDBACK_LINK}"
        )

    # TODO(b/338089659): Looks like we can relax this 1 column
    # restriction if we check the contents of an iterable are strings
    # not integers.
    if (
        # Empty tuples, None, and False are allowed and falsey.
        index_col
        and not isinstance(index_col, bigframes.enums.DefaultIndexKind)
        and not isinstance(index_col, str)
    ):
        raise NotImplementedError(
            "BigQuery engine only supports a single column name for `index_col`, "
            f"got: {repr(index_col)}. {constants.FEEDBACK_LINK}"
        )

    # None and False cannot be passed to read_gbq.
    # TODO(b/338400133): When index_col is None, we should be using the
    # first column of the CSV as the index to be compatible with the
    # pandas engine. According to the pandas docs, only "False"
    # indicates a default sequential index.
    if not index_col:
        index_col = ()

    index_col = typing.cast(
        Union[
            Sequence[str],  # Falsey values
            bigframes.enums.DefaultIndexKind,
            str,
        ],
        index_col,
    )

    # usecols should only be an iterable of strings (column names) for use
    # as columns in read_gbq.
    columns: Tuple[Any, ...] = ()
    if usecols is not None:
        # Materialize once *before* validating: iterating a generator (or any
        # other one-shot iterable) for the check and then again for the copy
        # would leave `columns` silently empty.
        selected = tuple(usecols) if isinstance(usecols, Iterable) else None
        if selected is None or not all(isinstance(col, str) for col in selected):
            raise NotImplementedError(
                "BigQuery engine only supports an iterable of strings for `usecols`. "
                f"{constants.FEEDBACK_LINK}"
            )
        columns = selected

    if encoding is not None and encoding not in _VALID_ENCODINGS:
        raise NotImplementedError(
            f"BigQuery engine only supports the following encodings: {_VALID_ENCODINGS}. "
            f"{constants.FEEDBACK_LINK}"
        )

    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.autodetect = True
    job_config.field_delimiter = sep
    job_config.encoding = encoding
    job_config.labels = {"bigframes-api": "read_csv"}

    # b/409070192: When header > 0, pandas and BigFrames return different
    # column naming.

    # We want to match pandas behavior, where `header` is the 0-based index of
    # the header row:
    #   * header == 0: leave `skip_leading_rows` unset so autodetect looks for
    #     the header in the first row.
    #   * header is None: there is no header; `skip_leading_rows = 0` tells
    #     BigQuery to treat every row as data.
    #   * header == N > 0: per the BigQuery load-job docs, with autodetect a
    #     value of K > 0 skips K - 1 rows and looks for the header in row K
    #     (1-based), so the 0-based pandas index N maps to N + 1.
    if header is None:
        job_config.skip_leading_rows = 0
    elif header > 0:
        job_config.skip_leading_rows = header + 1

    return self._loader.read_bigquery_load_job(
        filepath_or_buffer,
        job_config=job_config,
        index_col=index_col,
        columns=columns,
    )
1019
1090
1020
1091
def read_pickle (
1021
1092
self ,
0 commit comments