diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 2567653c0..108063d4d 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -1,3 +1,3 @@ docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:87eee22d276554e4e52863ec9b1cb6a7245815dfae20439712bf644348215a5a + digest: sha256:4ee57a76a176ede9087c14330c625a71553cf9c72828b2c0ca12f5338171ba60 diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml index 6572e5982..01affbae5 100644 --- a/.github/sync-repo-settings.yaml +++ b/.github/sync-repo-settings.yaml @@ -1,9 +1,12 @@ # https://github.com/googleapis/repo-automation-bots/tree/main/packages/sync-repo-settings +# Allow merge commits to sync main and v3 with fewer conflicts. +mergeCommitAllowed: true # Rules for main branch protection branchProtectionRules: # Identifies the protection rule pattern. Name of the branch to be protected. # Defaults to `main` - pattern: main + requiresLinearHistory: true requiresCodeOwnerReviews: true requiresStrictStatusChecks: true requiredStatusCheckContexts: @@ -15,6 +18,7 @@ branchProtectionRules: - 'Samples - Python 3.7' - 'Samples - Python 3.8' - pattern: v3 + requiresLinearHistory: false requiresCodeOwnerReviews: true requiresStrictStatusChecks: true requiredStatusCheckContexts: diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg index 0c99ae611..41b86fc29 100644 --- a/.kokoro/docs/common.cfg +++ b/.kokoro/docs/common.cfg @@ -30,6 +30,7 @@ env_vars: { env_vars: { key: "V2_STAGING_BUCKET" + # Push google cloud library docs to the Cloud RAD bucket `docs-staging-v2` value: "docs-staging-v2" } diff --git a/.kokoro/samples/lint/common.cfg b/.kokoro/samples/lint/common.cfg index 3e41df313..153746ccc 100644 --- a/.kokoro/samples/lint/common.cfg +++ b/.kokoro/samples/lint/common.cfg @@ -31,4 +31,4 @@ gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" # Use the trampoline script to run in docker. -build_file: "python-bigquery/.kokoro/trampoline.sh" \ No newline at end of file +build_file: "python-bigquery/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.10/common.cfg b/.kokoro/samples/python3.10/common.cfg new file mode 100644 index 000000000..da4003d76 --- /dev/null +++ b/.kokoro/samples/python3.10/common.cfg @@ -0,0 +1,40 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Build logs will be here +action { + define_artifacts { + regex: "**/*sponge_log.xml" + } +} + +# Specify which tests to run +env_vars: { + key: "RUN_TESTS_SESSION" + value: "py-3.10" +} + +# Declare build specific Cloud project. +env_vars: { + key: "BUILD_SPECIFIC_GCLOUD_PROJECT" + value: "python-docs-samples-tests-310" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery/.kokoro/test-samples.sh" +} + +# Configure the docker image for kokoro-trampoline. +env_vars: { + key: "TRAMPOLINE_IMAGE" + value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" +} + +# Download secrets for samples +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" + +# Download trampoline resources. +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" + +# Use the trampoline script to run in docker. 
+build_file: "python-bigquery/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.10/continuous.cfg b/.kokoro/samples/python3.10/continuous.cfg new file mode 100644 index 000000000..a1c8d9759 --- /dev/null +++ b/.kokoro/samples/python3.10/continuous.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} \ No newline at end of file diff --git a/.kokoro/samples/python3.10/periodic-head.cfg b/.kokoro/samples/python3.10/periodic-head.cfg new file mode 100644 index 000000000..5aa01bab5 --- /dev/null +++ b/.kokoro/samples/python3.10/periodic-head.cfg @@ -0,0 +1,11 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" +} diff --git a/.kokoro/samples/python3.10/periodic.cfg b/.kokoro/samples/python3.10/periodic.cfg new file mode 100644 index 000000000..71cd1e597 --- /dev/null +++ b/.kokoro/samples/python3.10/periodic.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "False" +} diff --git a/.kokoro/samples/python3.10/presubmit.cfg b/.kokoro/samples/python3.10/presubmit.cfg new file mode 100644 index 000000000..a1c8d9759 --- /dev/null +++ b/.kokoro/samples/python3.10/presubmit.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} \ No newline at end of file diff --git a/.kokoro/samples/python3.6/common.cfg b/.kokoro/samples/python3.6/common.cfg index f3b930960..20f6b9691 100644 --- a/.kokoro/samples/python3.6/common.cfg +++ b/.kokoro/samples/python3.6/common.cfg @@ -37,4 +37,4 @@ gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" # Use the trampoline script to run in docker. -build_file: "python-bigquery/.kokoro/trampoline.sh" \ No newline at end of file +build_file: "python-bigquery/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.6/periodic.cfg b/.kokoro/samples/python3.6/periodic.cfg index 50fec9649..71cd1e597 100644 --- a/.kokoro/samples/python3.6/periodic.cfg +++ b/.kokoro/samples/python3.6/periodic.cfg @@ -3,4 +3,4 @@ env_vars: { key: "INSTALL_LIBRARY_FROM_SOURCE" value: "False" -} \ No newline at end of file +} diff --git a/.kokoro/samples/python3.7/common.cfg b/.kokoro/samples/python3.7/common.cfg index fc0654565..d30dc6018 100644 --- a/.kokoro/samples/python3.7/common.cfg +++ b/.kokoro/samples/python3.7/common.cfg @@ -37,4 +37,4 @@ gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" # Use the trampoline script to run in docker. 
-build_file: "python-bigquery/.kokoro/trampoline.sh" \ No newline at end of file +build_file: "python-bigquery/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.7/periodic.cfg b/.kokoro/samples/python3.7/periodic.cfg index 50fec9649..71cd1e597 100644 --- a/.kokoro/samples/python3.7/periodic.cfg +++ b/.kokoro/samples/python3.7/periodic.cfg @@ -3,4 +3,4 @@ env_vars: { key: "INSTALL_LIBRARY_FROM_SOURCE" value: "False" -} \ No newline at end of file +} diff --git a/.kokoro/samples/python3.8/common.cfg b/.kokoro/samples/python3.8/common.cfg index 2b0bf59b3..46759c6d6 100644 --- a/.kokoro/samples/python3.8/common.cfg +++ b/.kokoro/samples/python3.8/common.cfg @@ -37,4 +37,4 @@ gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" # Use the trampoline script to run in docker. -build_file: "python-bigquery/.kokoro/trampoline.sh" \ No newline at end of file +build_file: "python-bigquery/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.8/periodic.cfg b/.kokoro/samples/python3.8/periodic.cfg index 50fec9649..71cd1e597 100644 --- a/.kokoro/samples/python3.8/periodic.cfg +++ b/.kokoro/samples/python3.8/periodic.cfg @@ -3,4 +3,4 @@ env_vars: { key: "INSTALL_LIBRARY_FROM_SOURCE" value: "False" -} \ No newline at end of file +} diff --git a/.kokoro/samples/python3.9/common.cfg b/.kokoro/samples/python3.9/common.cfg index f179577a5..58d56ce74 100644 --- a/.kokoro/samples/python3.9/common.cfg +++ b/.kokoro/samples/python3.9/common.cfg @@ -37,4 +37,4 @@ gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" # Use the trampoline script to run in docker. -build_file: "python-bigquery/.kokoro/trampoline.sh" \ No newline at end of file +build_file: "python-bigquery/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.9/periodic.cfg b/.kokoro/samples/python3.9/periodic.cfg index 50fec9649..71cd1e597 100644 --- a/.kokoro/samples/python3.9/periodic.cfg +++ b/.kokoro/samples/python3.9/periodic.cfg @@ -3,4 +3,4 @@ env_vars: { key: "INSTALL_LIBRARY_FROM_SOURCE" value: "False" -} \ No newline at end of file +} diff --git a/.kokoro/test-samples-against-head.sh b/.kokoro/test-samples-against-head.sh index 689948a23..ba3a707b0 100755 --- a/.kokoro/test-samples-against-head.sh +++ b/.kokoro/test-samples-against-head.sh @@ -23,6 +23,4 @@ set -eo pipefail # Enables `**` to include files nested inside sub-folders shopt -s globstar -cd github/python-bigquery - exec .kokoro/test-samples-impl.sh diff --git a/.kokoro/test-samples.sh b/.kokoro/test-samples.sh index 62ef534cd..11c042d34 100755 --- a/.kokoro/test-samples.sh +++ b/.kokoro/test-samples.sh @@ -24,8 +24,6 @@ set -eo pipefail # Enables `**` to include files nested inside sub-folders shopt -s globstar -cd github/python-bigquery - # Run periodic samples tests at latest release if [[ $KOKORO_BUILD_ARTIFACTS_SUBDIR = *"periodic"* ]]; then # preserving the test runner implementation. 
diff --git a/.repo-metadata.json b/.repo-metadata.json index f132056d5..124b40eb9 100644 --- a/.repo-metadata.json +++ b/.repo-metadata.json @@ -1,14 +1,16 @@ { - "name": "bigquery", - "name_pretty": "Google Cloud BigQuery", - "product_documentation": "https://cloud.google.com/bigquery", - "client_documentation": "https://googleapis.dev/python/bigquery/latest", - "issue_tracker": "https://issuetracker.google.com/savedsearches/559654", - "release_level": "ga", - "language": "python", - "library_type": "GAPIC_COMBO", - "repo": "googleapis/python-bigquery", - "distribution_name": "google-cloud-bigquery", - "api_id": "bigquery.googleapis.com", - "requires_billing": false -} \ No newline at end of file + "name": "bigquery", + "name_pretty": "Google Cloud BigQuery", + "product_documentation": "https://cloud.google.com/bigquery", + "client_documentation": "https://googleapis.dev/python/bigquery/latest", + "issue_tracker": "https://issuetracker.google.com/savedsearches/559654", + "release_level": "ga", + "language": "python", + "library_type": "GAPIC_COMBO", + "repo": "googleapis/python-bigquery", + "distribution_name": "google-cloud-bigquery", + "api_id": "bigquery.googleapis.com", + "requires_billing": false, + "default_version": "v2", + "codeowner_team": "@googleapis/api-bigquery" +} diff --git a/.trampolinerc b/.trampolinerc index 383b6ec89..0eee72ab6 100644 --- a/.trampolinerc +++ b/.trampolinerc @@ -16,15 +16,26 @@ # Add required env vars here. required_envvars+=( - "STAGING_BUCKET" - "V2_STAGING_BUCKET" ) # Add env vars which are passed down into the container here. pass_down_envvars+=( + "NOX_SESSION" + ############### + # Docs builds + ############### "STAGING_BUCKET" "V2_STAGING_BUCKET" - "NOX_SESSION" + ################## + # Samples builds + ################## + "INSTALL_LIBRARY_FROM_SOURCE" + "RUN_TESTS_SESSION" + "BUILD_SPECIFIC_GCLOUD_PROJECT" + # Target directories. + "RUN_TESTS_DIRS" + # The nox session to run. + "RUN_TESTS_SESSION" ) # Prevent unintentional override on the default image. diff --git a/CHANGELOG.md b/CHANGELOG.md index d531ec477..d15f22851 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ [1]: https://pypi.org/project/google-cloud-bigquery/#history +### [2.28.1](https://www.github.com/googleapis/python-bigquery/compare/v2.28.0...v2.28.1) (2021-10-07) + + +### Bug Fixes + +* support ARRAY data type when loading from DataFrame with Parquet ([#980](https://www.github.com/googleapis/python-bigquery/issues/980)) ([1e59083](https://www.github.com/googleapis/python-bigquery/commit/1e5908302d36e15442013af6f46b1c20af28255e)) + +## [2.28.0](https://www.github.com/googleapis/python-bigquery/compare/v2.27.1...v2.28.0) (2021-09-30) + + +### Features + +* add `AvroOptions` to configure AVRO external data ([#994](https://www.github.com/googleapis/python-bigquery/issues/994)) ([1a9431d](https://www.github.com/googleapis/python-bigquery/commit/1a9431d9e02eeb99e4712b61c623f9cca80134a6)) + + +### Documentation + +* link to stable pandas docs ([#990](https://www.github.com/googleapis/python-bigquery/issues/990)) ([ea50e80](https://www.github.com/googleapis/python-bigquery/commit/ea50e8031fc035b3772a338bc00982de263cefad)) + ### [2.27.1](https://www.github.com/googleapis/python-bigquery/compare/v2.27.0...v2.27.1) (2021-09-27) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 8aecf9dd2..f183b63b4 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. 
- The feature must work fully on the following CPython versions: - 3.6, 3.7, 3.8 and 3.9 on both UNIX and Windows. + 3.6, 3.7, 3.8, 3.9 and 3.10 on both UNIX and Windows. - The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -72,7 +72,7 @@ We use `nox `__ to instrument our tests. - To run a single unit test:: - $ nox -s unit-3.9 -- -k + $ nox -s unit-3.10 -- -k .. note:: @@ -225,11 +225,13 @@ We support: - `Python 3.7`_ - `Python 3.8`_ - `Python 3.9`_ +- `Python 3.10`_ .. _Python 3.6: https://docs.python.org/3.6/ .. _Python 3.7: https://docs.python.org/3.7/ .. _Python 3.8: https://docs.python.org/3.8/ .. _Python 3.9: https://docs.python.org/3.9/ +.. _Python 3.10: https://docs.python.org/3.10/ Supported versions can be found in our ``noxfile.py`` `config`_. diff --git a/docs/conf.py b/docs/conf.py index fa5217731..3d07b6bf5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -364,10 +364,10 @@ "google-auth": ("https://googleapis.dev/python/google-auth/latest/", None), "google.api_core": ("https://googleapis.dev/python/google-api-core/latest/", None,), "grpc": ("https://grpc.github.io/grpc/python/", None), - "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None), - "geopandas": ("https://geopandas.org/", None), "proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None), "protobuf": ("https://googleapis.dev/python/protobuf/latest/", None), + "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None), + "geopandas": ("https://geopandas.org/", None), } diff --git a/docs/format_options.rst b/docs/format_options.rst new file mode 100644 index 000000000..b3948209e --- /dev/null +++ b/docs/format_options.rst @@ -0,0 +1,6 @@ +BigQuery Format Options +======================= + +.. automodule:: google.cloud.bigquery.format_options + :members: + :undoc-members: diff --git a/docs/job_base.rst b/docs/job_base.rst new file mode 100644 index 000000000..f5ef06b88 --- /dev/null +++ b/docs/job_base.rst @@ -0,0 +1,5 @@ +Common Job Resource Classes +=========================== + +.. automodule:: google.cloud.bigquery.job.base + :members: diff --git a/docs/query.rst b/docs/query.rst new file mode 100644 index 000000000..d3cb8fe83 --- /dev/null +++ b/docs/query.rst @@ -0,0 +1,5 @@ +Query Resource Classes +====================== + +.. automodule:: google.cloud.bigquery.query + :members: diff --git a/docs/reference.rst b/docs/reference.rst index 713b9239d..4f655b09e 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -47,7 +47,6 @@ Job Classes job.CopyJob job.LoadJob job.ExtractJob - job.UnknownJob Job-Related Types ----------------- @@ -68,7 +67,11 @@ Job-Related Types job.SourceFormat job.WriteDisposition job.SchemaUpdateOption - job.TransactionInfo + +.. toctree:: + :maxdepth: 2 + + job_base Dataset @@ -134,14 +137,10 @@ Schema Query ===== -.. autosummary:: - :toctree: generated +.. toctree:: + :maxdepth: 2 - query.ArrayQueryParameter - query.ScalarQueryParameter - query.ScalarQueryParameterType - query.StructQueryParameter - query.UDFResource + query Retries @@ -167,6 +166,11 @@ External Configuration external_config.CSVOptions external_config.GoogleSheetsOptions +.. 
toctree:: + :maxdepth: 2 + + format_options + Magics ====== diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index ec8f68af0..a30d748bb 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -49,6 +49,7 @@ from google.cloud.bigquery.external_config import CSVOptions from google.cloud.bigquery.external_config import GoogleSheetsOptions from google.cloud.bigquery.external_config import ExternalSourceFormat +from google.cloud.bigquery.format_options import AvroOptions from google.cloud.bigquery.format_options import ParquetOptions from google.cloud.bigquery.job import Compression from google.cloud.bigquery.job import CopyJob @@ -149,6 +150,7 @@ "PolicyTagList", "UDFResource", "ExternalConfig", + "AvroOptions", "BigtableOptions", "BigtableColumnFamily", "BigtableColumn", diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index f400f9b70..f2a8f34f0 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -96,6 +96,10 @@ def installed_version(self) -> packaging.version.Version: return self._installed_version + @property + def use_compliant_nested_type(self) -> bool: + return self.installed_version.major >= 4 + BQ_STORAGE_VERSIONS = BQStorageVersions() PYARROW_VERSIONS = PyarrowVersions() diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index b034c0fd1..da2fdc811 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -89,8 +89,8 @@ def _to_wkb(v): _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", "datetime64[ns, UTC]": "TIMESTAMP", - # BigQuery does not support uploading DATETIME values from Parquet files. - # See: https://github.com/googleapis/google-cloud-python/issues/9996 + # TODO: Update to DATETIME in V3 + # https://github.com/googleapis/python-bigquery/issues/985 "datetime64[ns]": "TIMESTAMP", "float32": "FLOAT", "float64": "FLOAT", @@ -424,7 +424,7 @@ def dataframe_to_bq_schema(dataframe, bq_schema): # column, but it was not found. 
if bq_schema_unused: raise ValueError( - u"bq_schema contains fields not present in dataframe: {}".format( + "bq_schema contains fields not present in dataframe: {}".format( bq_schema_unused ) ) @@ -465,7 +465,14 @@ def augment_schema(dataframe, current_bq_schema): continue arrow_table = pyarrow.array(dataframe[field.name]) - detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.type.id) + + if pyarrow.types.is_list(arrow_table.type): + # `pyarrow.ListType` + detected_mode = "REPEATED" + detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.values.type.id) + else: + detected_mode = field.mode + detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.type.id) if detected_type is None: unknown_type_fields.append(field) @@ -474,7 +481,7 @@ def augment_schema(dataframe, current_bq_schema): new_field = schema.SchemaField( name=field.name, field_type=detected_type, - mode=field.mode, + mode=detected_mode, description=field.description, fields=field.fields, ) @@ -482,7 +489,7 @@ def augment_schema(dataframe, current_bq_schema): if unknown_type_fields: warnings.warn( - u"Pyarrow could not determine the type of columns: {}.".format( + "Pyarrow could not determine the type of columns: {}.".format( ", ".join(field.name for field in unknown_type_fields) ) ) @@ -521,7 +528,7 @@ def dataframe_to_arrow(dataframe, bq_schema): extra_fields = bq_field_names - column_and_index_names if extra_fields: raise ValueError( - u"bq_schema contains fields not present in dataframe: {}".format( + "bq_schema contains fields not present in dataframe: {}".format( extra_fields ) ) @@ -531,7 +538,7 @@ def dataframe_to_arrow(dataframe, bq_schema): missing_fields = column_names - bq_field_names if missing_fields: raise ValueError( - u"bq_schema is missing fields from dataframe: {}".format(missing_fields) + "bq_schema is missing fields from dataframe: {}".format(missing_fields) ) arrow_arrays = [] @@ -551,7 +558,13 @@ def dataframe_to_arrow(dataframe, bq_schema): return pyarrow.Table.from_arrays(arrow_arrays, names=arrow_names) -def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SNAPPY"): +def dataframe_to_parquet( + dataframe, + bq_schema, + filepath, + parquet_compression="SNAPPY", + parquet_use_compliant_nested_type=True, +): """Write dataframe as a Parquet file, according to the desired BQ schema. This function requires the :mod:`pyarrow` package. Arrow is used as an @@ -572,10 +585,27 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN The compression codec to use by the the ``pyarrow.parquet.write_table`` serializing method. Defaults to "SNAPPY". https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table + parquet_use_compliant_nested_type (bool): + Whether the ``pyarrow.parquet.write_table`` serializing method should write + compliant Parquet nested type (lists). Defaults to ``True``. + https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types + https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table + + This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``. 
""" + import pyarrow.parquet + + kwargs = ( + {"use_compliant_nested_type": parquet_use_compliant_nested_type} + if _helpers.PYARROW_VERSIONS.use_compliant_nested_type + else {} + ) + bq_schema = schema._to_schema_fields(bq_schema) arrow_table = dataframe_to_arrow(dataframe, bq_schema) - pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression) + pyarrow.parquet.write_table( + arrow_table, filepath, compression=parquet_compression, **kwargs, + ) def _row_iterator_page_to_arrow(page, column_names, arrow_types): diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index a738dd0f3..9f3a4f972 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -89,6 +89,8 @@ from google.cloud.bigquery.table import TableListItem from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.table import RowIterator +from google.cloud.bigquery.format_options import ParquetOptions +from google.cloud.bigquery import _helpers _DEFAULT_CHUNKSIZE = 100 * 1024 * 1024 # 100 MB @@ -518,7 +520,7 @@ def _dataset_from_arg(self, dataset): def create_dataset( self, - dataset: Union[str, Dataset, DatasetReference], + dataset: Union[str, Dataset, DatasetReference, DatasetListItem], exists_ok: bool = False, retry: retries.Retry = DEFAULT_RETRY, timeout: float = DEFAULT_TIMEOUT, @@ -648,7 +650,7 @@ def create_routine( def create_table( self, - table: Union[str, Table, TableReference], + table: Union[str, Table, TableReference, TableListItem], exists_ok: bool = False, retry: retries.Retry = DEFAULT_RETRY, timeout: float = DEFAULT_TIMEOUT, @@ -662,6 +664,7 @@ def create_table( table (Union[ \ google.cloud.bigquery.table.Table, \ google.cloud.bigquery.table.TableReference, \ + google.cloud.bigquery.table.TableListItem, \ str, \ ]): A :class:`~google.cloud.bigquery.table.Table` to create. @@ -1264,7 +1267,7 @@ def update_table( def list_models( self, - dataset: Union[Dataset, DatasetReference, str], + dataset: Union[Dataset, DatasetReference, DatasetListItem, str], max_results: int = None, page_token: str = None, retry: retries.Retry = DEFAULT_RETRY, @@ -1341,7 +1344,7 @@ def api_request(*args, **kwargs): def list_routines( self, - dataset: Union[Dataset, DatasetReference, str], + dataset: Union[Dataset, DatasetReference, DatasetListItem, str], max_results: int = None, page_token: str = None, retry: retries.Retry = DEFAULT_RETRY, @@ -1418,7 +1421,7 @@ def api_request(*args, **kwargs): def list_tables( self, - dataset: Union[Dataset, DatasetReference, str], + dataset: Union[Dataset, DatasetReference, DatasetListItem, str], max_results: int = None, page_token: str = None, retry: retries.Retry = DEFAULT_RETRY, @@ -1494,7 +1497,7 @@ def api_request(*args, **kwargs): def delete_dataset( self, - dataset: Union[Dataset, DatasetReference, str], + dataset: Union[Dataset, DatasetReference, DatasetListItem, str], delete_contents: bool = False, retry: retries.Retry = DEFAULT_RETRY, timeout: float = DEFAULT_TIMEOUT, @@ -2430,10 +2433,10 @@ def load_table_from_dataframe( They are supported when using the PARQUET source format, but due to the way they are encoded in the ``parquet`` file, a mismatch with the existing table schema can occur, so - 100% compatibility cannot be guaranteed for REPEATED fields when + REPEATED fields are not properly supported when using ``pyarrow<4.0.0`` using the parquet format. 
- https://github.com/googleapis/python-bigquery/issues/17 + https://github.com/googleapis/python-bigquery/issues/19 Args: dataframe (pandas.DataFrame): @@ -2480,18 +2483,18 @@ def load_table_from_dataframe( :attr:`~google.cloud.bigquery.job.SourceFormat.PARQUET` are supported. parquet_compression (Optional[str]): - [Beta] The compression method to use if intermittently - serializing ``dataframe`` to a parquet file. - - The argument is directly passed as the ``compression`` - argument to the underlying ``pyarrow.parquet.write_table()`` - method (the default value "snappy" gets converted to uppercase). - https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table - - If the job config schema is missing, the argument is directly - passed as the ``compression`` argument to the underlying - ``DataFrame.to_parquet()`` method. - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet + [Beta] The compression method to use if intermittently + serializing ``dataframe`` to a parquet file. + + The argument is directly passed as the ``compression`` + argument to the underlying ``pyarrow.parquet.write_table()`` + method (the default value "snappy" gets converted to uppercase). + https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table + + If the job config schema is missing, the argument is directly + passed as the ``compression`` argument to the underlying + ``DataFrame.to_parquet()`` method. + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet timeout (Optional[float]): The number of seconds to wait for the underlying HTTP transport before using ``retry``. @@ -2520,6 +2523,16 @@ def load_table_from_dataframe( if job_config.source_format is None: # default value job_config.source_format = job.SourceFormat.PARQUET + + if ( + job_config.source_format == job.SourceFormat.PARQUET + and job_config.parquet_options is None + ): + parquet_options = ParquetOptions() + # default value + parquet_options.enable_list_inference = True + job_config.parquet_options = parquet_options + if job_config.source_format not in supported_formats: raise ValueError( "Got unexpected source_format: '{}'. 
Currently, only PARQUET and CSV are supported".format( @@ -2591,9 +2604,19 @@ def load_table_from_dataframe( job_config.schema, tmppath, parquet_compression=parquet_compression, + parquet_use_compliant_nested_type=True, ) else: - dataframe.to_parquet(tmppath, compression=parquet_compression) + dataframe.to_parquet( + tmppath, + engine="pyarrow", + compression=parquet_compression, + **( + {"use_compliant_nested_type": True} + if _helpers.PYARROW_VERSIONS.use_compliant_nested_type + else {} + ), + ) else: diff --git a/google/cloud/bigquery/external_config.py b/google/cloud/bigquery/external_config.py index f1692ba50..5f284c639 100644 --- a/google/cloud/bigquery/external_config.py +++ b/google/cloud/bigquery/external_config.py @@ -22,13 +22,13 @@ import base64 import copy -from typing import FrozenSet, Iterable, Optional +from typing import FrozenSet, Iterable, Optional, Union from google.cloud.bigquery._helpers import _to_bytes from google.cloud.bigquery._helpers import _bytes_to_json from google.cloud.bigquery._helpers import _int_or_none from google.cloud.bigquery._helpers import _str_or_none -from google.cloud.bigquery.format_options import ParquetOptions +from google.cloud.bigquery.format_options import AvroOptions, ParquetOptions from google.cloud.bigquery.schema import SchemaField @@ -548,7 +548,13 @@ def from_api_repr(cls, resource: dict) -> "GoogleSheetsOptions": return config -_OPTION_CLASSES = (BigtableOptions, CSVOptions, GoogleSheetsOptions, ParquetOptions) +_OPTION_CLASSES = ( + AvroOptions, + BigtableOptions, + CSVOptions, + GoogleSheetsOptions, + ParquetOptions, +) class HivePartitioningOptions(object): @@ -646,11 +652,6 @@ class ExternalConfig(object): def __init__(self, source_format): self._properties = {"sourceFormat": source_format} - self._options = None - for optcls in _OPTION_CLASSES: - if source_format == optcls._SOURCE_FORMAT: - self._options = optcls() - break @property def source_format(self): @@ -663,9 +664,17 @@ def source_format(self): return self._properties["sourceFormat"] @property - def options(self): - """Optional[Dict[str, Any]]: Source-specific options.""" - return self._options + def options(self) -> Optional[Union[_OPTION_CLASSES]]: + """Source-specific options.""" + for optcls in _OPTION_CLASSES: + if self.source_format == optcls._SOURCE_FORMAT: + options = optcls() + self._properties.setdefault(optcls._RESOURCE_NAME, {}) + options._properties = self._properties[optcls._RESOURCE_NAME] + return options + + # No matching source format found. + return None @property def autodetect(self): @@ -815,23 +824,120 @@ def schema(self, value): self._properties["schema"] = prop @property - def parquet_options(self): - """Optional[google.cloud.bigquery.format_options.ParquetOptions]: Additional - properties to set if ``sourceFormat`` is set to PARQUET. + def avro_options(self) -> Optional[AvroOptions]: + """Additional properties to set if ``sourceFormat`` is set to AVRO. 
+ + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.avro_options + """ + if self.source_format == ExternalSourceFormat.AVRO: + self._properties.setdefault(AvroOptions._RESOURCE_NAME, {}) + resource = self._properties.get(AvroOptions._RESOURCE_NAME) + if resource is None: + return None + options = AvroOptions() + options._properties = resource + return options + + @avro_options.setter + def avro_options(self, value): + if self.source_format != ExternalSourceFormat.AVRO: + msg = f"Cannot set Avro options, source format is {self.source_format}" + raise TypeError(msg) + self._properties[AvroOptions._RESOURCE_NAME] = value._properties + + @property + def bigtable_options(self) -> Optional[BigtableOptions]: + """Additional properties to set if ``sourceFormat`` is set to BIGTABLE. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.bigtable_options + """ + if self.source_format == ExternalSourceFormat.BIGTABLE: + self._properties.setdefault(BigtableOptions._RESOURCE_NAME, {}) + resource = self._properties.get(BigtableOptions._RESOURCE_NAME) + if resource is None: + return None + options = BigtableOptions() + options._properties = resource + return options + + @bigtable_options.setter + def bigtable_options(self, value): + if self.source_format != ExternalSourceFormat.BIGTABLE: + msg = f"Cannot set Bigtable options, source format is {self.source_format}" + raise TypeError(msg) + self._properties[BigtableOptions._RESOURCE_NAME] = value._properties + + @property + def csv_options(self) -> Optional[CSVOptions]: + """Additional properties to set if ``sourceFormat`` is set to CSV. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.csv_options + """ + if self.source_format == ExternalSourceFormat.CSV: + self._properties.setdefault(CSVOptions._RESOURCE_NAME, {}) + resource = self._properties.get(CSVOptions._RESOURCE_NAME) + if resource is None: + return None + options = CSVOptions() + options._properties = resource + return options + + @csv_options.setter + def csv_options(self, value): + if self.source_format != ExternalSourceFormat.CSV: + msg = f"Cannot set CSV options, source format is {self.source_format}" + raise TypeError(msg) + self._properties[CSVOptions._RESOURCE_NAME] = value._properties + + @property + def google_sheets_options(self) -> Optional[GoogleSheetsOptions]: + """Additional properties to set if ``sourceFormat`` is set to + GOOGLE_SHEETS. + + See: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.google_sheets_options + """ + if self.source_format == ExternalSourceFormat.GOOGLE_SHEETS: + self._properties.setdefault(GoogleSheetsOptions._RESOURCE_NAME, {}) + resource = self._properties.get(GoogleSheetsOptions._RESOURCE_NAME) + if resource is None: + return None + options = GoogleSheetsOptions() + options._properties = resource + return options + + @google_sheets_options.setter + def google_sheets_options(self, value): + if self.source_format != ExternalSourceFormat.GOOGLE_SHEETS: + msg = f"Cannot set Google Sheets options, source format is {self.source_format}" + raise TypeError(msg) + self._properties[GoogleSheetsOptions._RESOURCE_NAME] = value._properties + + @property + def parquet_options(self) -> Optional[ParquetOptions]: + """Additional properties to set if ``sourceFormat`` is set to PARQUET. 
See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.parquet_options """ - if self.source_format != ExternalSourceFormat.PARQUET: + if self.source_format == ExternalSourceFormat.PARQUET: + self._properties.setdefault(ParquetOptions._RESOURCE_NAME, {}) + resource = self._properties.get(ParquetOptions._RESOURCE_NAME) + if resource is None: return None - return self._options + options = ParquetOptions() + options._properties = resource + return options @parquet_options.setter def parquet_options(self, value): if self.source_format != ExternalSourceFormat.PARQUET: msg = f"Cannot set Parquet options, source format is {self.source_format}" raise TypeError(msg) - self._options = value + self._properties[ParquetOptions._RESOURCE_NAME] = value._properties def to_api_repr(self) -> dict: """Build an API representation of this object. @@ -841,10 +947,6 @@ def to_api_repr(self) -> dict: A dictionary in the format used by the BigQuery API. """ config = copy.deepcopy(self._properties) - if self.options is not None: - r = self.options.to_api_repr() - if r != {}: - config[self.options._RESOURCE_NAME] = r return config @classmethod @@ -862,10 +964,5 @@ def from_api_repr(cls, resource: dict) -> "ExternalConfig": ExternalConfig: Configuration parsed from ``resource``. """ config = cls(resource["sourceFormat"]) - for optcls in _OPTION_CLASSES: - opts = resource.get(optcls._RESOURCE_NAME) - if opts is not None: - config._options = optcls.from_api_repr(opts) - break config._properties = copy.deepcopy(resource) return config diff --git a/google/cloud/bigquery/format_options.py b/google/cloud/bigquery/format_options.py index 2c9a2ce20..1208565a9 100644 --- a/google/cloud/bigquery/format_options.py +++ b/google/cloud/bigquery/format_options.py @@ -13,7 +13,59 @@ # limitations under the License. import copy -from typing import Dict +from typing import Dict, Optional + + +class AvroOptions: + """Options if source format is set to AVRO.""" + + _SOURCE_FORMAT = "AVRO" + _RESOURCE_NAME = "avroOptions" + + def __init__(self): + self._properties = {} + + @property + def use_avro_logical_types(self) -> Optional[bool]: + """[Optional] If sourceFormat is set to 'AVRO', indicates whether to + interpret logical types as the corresponding BigQuery data type (for + example, TIMESTAMP), instead of using the raw type (for example, + INTEGER). + + See + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#AvroOptions.FIELDS.use_avro_logical_types + """ + return self._properties.get("useAvroLogicalTypes") + + @use_avro_logical_types.setter + def use_avro_logical_types(self, value): + self._properties["useAvroLogicalTypes"] = value + + @classmethod + def from_api_repr(cls, resource: Dict[str, bool]) -> "AvroOptions": + """Factory: construct an instance from a resource dict. + + Args: + resource (Dict[str, bool]): + Definition of a :class:`~.format_options.AvroOptions` instance in + the same representation as is returned from the API. + + Returns: + :class:`~.format_options.AvroOptions`: + Configuration parsed from ``resource``. + """ + config = cls() + config._properties = copy.deepcopy(resource) + return config + + def to_api_repr(self) -> dict: + """Build an API representation of this object. + + Returns: + Dict[str, bool]: + A dictionary in the format used by the BigQuery API. 
+ """ + return copy.deepcopy(self._properties) class ParquetOptions: diff --git a/google/cloud/bigquery/job/base.py b/google/cloud/bigquery/job/base.py index 698181092..9e381ded6 100644 --- a/google/cloud/bigquery/job/base.py +++ b/google/cloud/bigquery/job/base.py @@ -19,7 +19,7 @@ import http import threading import typing -from typing import Dict, Optional +from typing import Dict, Optional, Sequence from google.api_core import exceptions import google.api_core.future.polling @@ -193,7 +193,8 @@ def parent_job_id(self): return _helpers._get_sub_prop(self._properties, ["statistics", "parentJobId"]) @property - def script_statistics(self): + def script_statistics(self) -> Optional["ScriptStatistics"]: + """Statistics for a child job of a script.""" resource = _helpers._get_sub_prop( self._properties, ["statistics", "scriptStatistics"] ) @@ -968,9 +969,8 @@ def __init__(self, resource): self._properties = resource @property - def stack_frames(self): - """List[ScriptStackFrame]: Stack trace where the current evaluation - happened. + def stack_frames(self) -> Sequence[ScriptStackFrame]: + """Stack trace where the current evaluation happened. Shows line/column/procedure name of each frame on the stack at the point where the current evaluation happened. @@ -982,7 +982,7 @@ def stack_frames(self): ] @property - def evaluation_kind(self): + def evaluation_kind(self) -> Optional[str]: """str: Indicates the type of child job. Possible values include ``STATEMENT`` and ``EXPRESSION``. @@ -1005,7 +1005,9 @@ def from_api_repr(cls, resource: dict, client) -> "UnknownJob": Returns: UnknownJob: Job corresponding to the resource. """ - job_ref_properties = resource.get("jobReference", {"projectId": client.project}) + job_ref_properties = resource.get( + "jobReference", {"projectId": client.project, "jobId": None} + ) job_ref = _JobReference._from_api_repr(job_ref_properties) job = cls(job_ref, client) # Populate the job reference with the project, even if it has been diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index c07daec99..6a973bb65 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -18,7 +18,7 @@ import copy import re import typing -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union from google.api_core import exceptions from google.api_core.future import polling as polling_future @@ -38,6 +38,7 @@ from google.cloud.bigquery.query import UDFResource from google.cloud.bigquery.retry import DEFAULT_RETRY, DEFAULT_JOB_RETRY from google.cloud.bigquery.routine import RoutineReference +from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import _EmptyRowIterator from google.cloud.bigquery.table import RangePartitioning from google.cloud.bigquery.table import _table_arg_to_table_ref @@ -57,6 +58,7 @@ import pyarrow from google.api_core import retry as retries from google.cloud import bigquery_storage + from google.cloud.bigquery.client import Client from google.cloud.bigquery.table import RowIterator @@ -853,7 +855,7 @@ def to_api_repr(self): } @classmethod - def from_api_repr(cls, resource: dict, client) -> "QueryJob": + def from_api_repr(cls, resource: dict, client: "Client") -> "QueryJob": """Factory: construct a job given its API representation Args: @@ -866,8 +868,10 @@ def from_api_repr(cls, resource: dict, client) -> "QueryJob": Returns: google.cloud.bigquery.job.QueryJob: Job parsed from ``resource``. 
""" - cls._check_resource_config(resource) - job_ref = _JobReference._from_api_repr(resource["jobReference"]) + job_ref_properties = resource.setdefault( + "jobReference", {"projectId": client.project, "jobId": None} + ) + job_ref = _JobReference._from_api_repr(job_ref_properties) job = cls(job_ref, None, client=client) job._set_properties(resource) return job @@ -887,6 +891,18 @@ def query_plan(self): plan_entries = self._job_statistics().get("queryPlan", ()) return [QueryPlanEntry.from_api_repr(entry) for entry in plan_entries] + @property + def schema(self) -> Optional[List[SchemaField]]: + """The schema of the results. + + Present only for successful dry run of non-legacy SQL queries. + """ + resource = self._job_statistics().get("schema") + if resource is None: + return None + fields = resource.get("fields", []) + return [SchemaField.from_api_repr(field) for field in fields] + @property def timeline(self): """List(TimelineEntry): Return the query execution timeline @@ -1318,6 +1334,8 @@ def result( If Non-``None`` and non-default ``job_retry`` is provided and the job is not retryable. """ + if self.dry_run: + return _EmptyRowIterator() try: retry_do_query = getattr(self, "_retry_do_query", None) if retry_do_query is not None: diff --git a/google/cloud/bigquery/schema.py b/google/cloud/bigquery/schema.py index f221e65a8..91311d332 100644 --- a/google/cloud/bigquery/schema.py +++ b/google/cloud/bigquery/schema.py @@ -66,6 +66,22 @@ class _DefaultSentinel(enum.Enum): _DEFAULT_VALUE = _DefaultSentinel.DEFAULT_VALUE +class _DefaultSentinel(enum.Enum): + """Object used as 'sentinel' indicating default value should be used. + + Uses enum so that pytype/mypy knows that this is the only possible value. + https://stackoverflow.com/a/60605919/101923 + + Literal[_DEFAULT_VALUE] is an alternative, but only added in Python 3.8. + https://docs.python.org/3/library/typing.html#typing.Literal + """ + + DEFAULT_VALUE = object() + + +_DEFAULT_VALUE = _DefaultSentinel.DEFAULT_VALUE + + class SchemaField(object): """Describe a single field within a table schema. diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 75901afb4..376323801 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -180,10 +180,8 @@ class TableReference(_TableBase): https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablereference Args: - dataset_ref: - A pointer to the dataset - table_id: - The ID of the table + dataset_ref: A pointer to the dataset + table_id: The ID of the table """ _PROPERTY_TO_API_FIELD = { diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py index 3e5c77ede..967959b05 100644 --- a/google/cloud/bigquery/version.py +++ b/google/cloud/bigquery/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.27.1" +__version__ = "2.28.1" diff --git a/google/cloud/bigquery_v2/types/encryption_config.py b/google/cloud/bigquery_v2/types/encryption_config.py index 4b9139733..a95954a30 100644 --- a/google/cloud/bigquery_v2/types/encryption_config.py +++ b/google/cloud/bigquery_v2/types/encryption_config.py @@ -25,6 +25,7 @@ class EncryptionConfiguration(proto.Message): r""" + Attributes: kms_key_name (google.protobuf.wrappers_pb2.StringValue): Optional. 
Describes the Cloud KMS encryption diff --git a/google/cloud/bigquery_v2/types/model.py b/google/cloud/bigquery_v2/types/model.py index 706418401..6e3ca0095 100644 --- a/google/cloud/bigquery_v2/types/model.py +++ b/google/cloud/bigquery_v2/types/model.py @@ -38,6 +38,7 @@ class Model(proto.Message): r""" + Attributes: etag (str): Output only. A hash of this resource. @@ -251,7 +252,8 @@ class FeedbackType(proto.Enum): EXPLICIT = 2 class SeasonalPeriod(proto.Message): - r""" """ + r""" + """ class SeasonalPeriodType(proto.Enum): r"""""" @@ -264,7 +266,8 @@ class SeasonalPeriodType(proto.Enum): YEARLY = 6 class KmeansEnums(proto.Message): - r""" """ + r""" + """ class KmeansInitializationMethod(proto.Enum): r"""Indicates the method used to initialize the centroids for @@ -386,6 +389,7 @@ class BinaryClassificationMetrics(proto.Message): class BinaryConfusionMatrix(proto.Message): r"""Confusion matrix for binary classification models. + Attributes: positive_class_threshold (google.protobuf.wrappers_pb2.DoubleValue): Threshold value used when computing each of @@ -464,6 +468,7 @@ class MultiClassClassificationMetrics(proto.Message): class ConfusionMatrix(proto.Message): r"""Confusion matrix for multi-class classification models. + Attributes: confidence_threshold (google.protobuf.wrappers_pb2.DoubleValue): Confidence threshold used when computing the @@ -474,6 +479,7 @@ class ConfusionMatrix(proto.Message): class Entry(proto.Message): r"""A single entry in the confusion matrix. + Attributes: predicted_label (str): The predicted label. For confidence_threshold > 0, we will @@ -491,6 +497,7 @@ class Entry(proto.Message): class Row(proto.Message): r"""A single row in the confusion matrix. + Attributes: actual_label (str): The original label of this row. @@ -525,6 +532,7 @@ class Row(proto.Message): class ClusteringMetrics(proto.Message): r"""Evaluation metrics for clustering models. + Attributes: davies_bouldin_index (google.protobuf.wrappers_pb2.DoubleValue): Davies-Bouldin index. @@ -537,6 +545,7 @@ class ClusteringMetrics(proto.Message): class Cluster(proto.Message): r"""Message containing the information about one cluster. + Attributes: centroid_id (int): Centroid id. @@ -550,6 +559,7 @@ class Cluster(proto.Message): class FeatureValue(proto.Message): r"""Representative value of a single feature within the cluster. + Attributes: feature_column (str): The feature column name. @@ -562,6 +572,7 @@ class FeatureValue(proto.Message): class CategoricalValue(proto.Message): r"""Representative value of a categorical feature. + Attributes: category_counts (Sequence[google.cloud.bigquery_v2.types.Model.ClusteringMetrics.Cluster.FeatureValue.CategoricalValue.CategoryCount]): Counts of all categories for the categorical feature. If @@ -573,6 +584,7 @@ class CategoricalValue(proto.Message): class CategoryCount(proto.Message): r"""Represents the count of a single category within the cluster. + Attributes: category (str): The name of category. @@ -668,6 +680,7 @@ class RankingMetrics(proto.Message): class ArimaForecastingMetrics(proto.Message): r"""Model evaluation metrics for ARIMA forecasting models. + Attributes: non_seasonal_order (Sequence[google.cloud.bigquery_v2.types.Model.ArimaOrder]): Non-seasonal order. @@ -857,6 +870,7 @@ class ArimaOrder(proto.Message): class ArimaFittingMetrics(proto.Message): r"""ARIMA model fitting metrics. + Attributes: log_likelihood (float): Log-likelihood. 
@@ -888,6 +902,7 @@ class GlobalExplanation(proto.Message): class Explanation(proto.Message): r"""Explanation for a single feature. + Attributes: feature_name (str): Full name of the feature. For non-numerical features, will @@ -910,6 +925,7 @@ class Explanation(proto.Message): class TrainingRun(proto.Message): r"""Information about a single training query run for the model. + Attributes: training_options (google.cloud.bigquery_v2.types.Model.TrainingRun.TrainingOptions): Options that were used for this training run, @@ -935,6 +951,7 @@ class TrainingRun(proto.Message): class TrainingOptions(proto.Message): r"""Options used in model training. + Attributes: max_iterations (int): The maximum number of iterations in training. @@ -1182,6 +1199,7 @@ class TrainingOptions(proto.Message): class IterationResult(proto.Message): r"""Information about a single iteration of the training run. + Attributes: index (google.protobuf.wrappers_pb2.Int32Value): Index of the iteration, 0 based. @@ -1205,6 +1223,7 @@ class IterationResult(proto.Message): class ClusterInfo(proto.Message): r"""Information about a single cluster for clustering model. + Attributes: centroid_id (int): Centroid id. @@ -1241,6 +1260,7 @@ class ArimaResult(proto.Message): class ArimaCoefficients(proto.Message): r"""Arima coefficients. + Attributes: auto_regressive_coefficients (Sequence[float]): Auto-regressive coefficients, an array of @@ -1263,6 +1283,7 @@ class ArimaCoefficients(proto.Message): class ArimaModelInfo(proto.Message): r"""Arima model information. + Attributes: non_seasonal_order (google.cloud.bigquery_v2.types.Model.ArimaOrder): Non-seasonal order. @@ -1409,6 +1430,7 @@ class ArimaModelInfo(proto.Message): class GetModelRequest(proto.Message): r""" + Attributes: project_id (str): Required. Project ID of the requested model. @@ -1425,6 +1447,7 @@ class GetModelRequest(proto.Message): class PatchModelRequest(proto.Message): r""" + Attributes: project_id (str): Required. Project ID of the model to patch. @@ -1447,6 +1470,7 @@ class PatchModelRequest(proto.Message): class DeleteModelRequest(proto.Message): r""" + Attributes: project_id (str): Required. Project ID of the model to delete. @@ -1463,6 +1487,7 @@ class DeleteModelRequest(proto.Message): class ListModelsRequest(proto.Message): r""" + Attributes: project_id (str): Required. Project ID of the models to list. @@ -1487,6 +1512,7 @@ class ListModelsRequest(proto.Message): class ListModelsResponse(proto.Message): r""" + Attributes: models (Sequence[google.cloud.bigquery_v2.types.Model]): Models in the requested dataset. Only the following fields diff --git a/google/cloud/bigquery_v2/types/model_reference.py b/google/cloud/bigquery_v2/types/model_reference.py index a9ebad613..544377f61 100644 --- a/google/cloud/bigquery_v2/types/model_reference.py +++ b/google/cloud/bigquery_v2/types/model_reference.py @@ -23,6 +23,7 @@ class ModelReference(proto.Message): r"""Id path of a model. + Attributes: project_id (str): Required. The ID of the project containing diff --git a/google/cloud/bigquery_v2/types/standard_sql.py b/google/cloud/bigquery_v2/types/standard_sql.py index 7a845fc48..69a221c3c 100644 --- a/google/cloud/bigquery_v2/types/standard_sql.py +++ b/google/cloud/bigquery_v2/types/standard_sql.py @@ -78,6 +78,7 @@ class TypeKind(proto.Enum): class StandardSqlField(proto.Message): r"""A field or a column. + Attributes: name (str): Optional. The name of this field. 
Can be @@ -96,6 +97,7 @@ class StandardSqlField(proto.Message): class StandardSqlStructType(proto.Message): r""" + Attributes: fields (Sequence[google.cloud.bigquery_v2.types.StandardSqlField]): @@ -106,6 +108,7 @@ class StandardSqlStructType(proto.Message): class StandardSqlTableType(proto.Message): r"""A table type + Attributes: columns (Sequence[google.cloud.bigquery_v2.types.StandardSqlField]): The columns in this table type diff --git a/google/cloud/bigquery_v2/types/table_reference.py b/google/cloud/bigquery_v2/types/table_reference.py index d56e5b09f..da206b4d7 100644 --- a/google/cloud/bigquery_v2/types/table_reference.py +++ b/google/cloud/bigquery_v2/types/table_reference.py @@ -23,6 +23,7 @@ class TableReference(proto.Message): r""" + Attributes: project_id (str): Required. The ID of the project containing diff --git a/owlbot.py b/owlbot.py index 86374858e..5fd5c436a 100644 --- a/owlbot.py +++ b/owlbot.py @@ -32,8 +32,6 @@ intersphinx_dependencies={ "pandas": "http://pandas.pydata.org/pandas-docs/stable/", "geopandas": "https://geopandas.org/", - "proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None), - "protobuf": ("https://googleapis.dev/python/protobuf/latest/", None), }, ) @@ -43,6 +41,7 @@ excludes=[ "noxfile.py", "docs/multiprocessing.rst", + "docs/index.rst", ".coveragerc", ".github/CODEOWNERS", # Include custom SNIPPETS_TESTS job for performance. @@ -55,10 +54,6 @@ ], ) -# Remove unneeded intersphinx links, the library does not use any proto-generated code. -s.replace("docs/conf.py", r'\s+"(proto-plus|protobuf)":.*$', "") - - # ---------------------------------------------------------------------------- # Samples templates # ---------------------------------------------------------------------------- diff --git a/samples/geography/noxfile.py b/samples/geography/noxfile.py index b008613f0..93a9122cc 100644 --- a/samples/geography/noxfile.py +++ b/samples/geography/noxfile.py @@ -87,7 +87,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9"] +ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"] # Any default versions that should be ignored. 
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] @@ -98,6 +98,10 @@ def get_pytest_env_vars() -> Dict[str, str]: "True", "true", ) + +# Error if a python version is missing +nox.options.error_on_missing_interpreters = True + # # Style Checks # diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 46162762c..ecd428ab9 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -1,5 +1,5 @@ attrs==21.2.0 -cachetools==4.2.2 +cachetools==4.2.4 certifi==2021.5.30 cffi==1.14.6 charset-normalizer==2.0.6 @@ -11,7 +11,7 @@ Fiona==1.8.20 geojson==2.5.0 geopandas==0.9.0 google-api-core==2.0.1 -google-auth==2.2.0 +google-auth==2.2.1 google-cloud-bigquery==2.27.1 google-cloud-bigquery-storage==2.9.0 google-cloud-core==2.0.0 @@ -29,6 +29,8 @@ numpy==1.21.2; python_version > "3.6" packaging==21.0 pandas==1.1.5; python_version < '3.7' pandas==1.3.2; python_version >= '3.7' +proto-plus==1.19.2 +protobuf==3.18.0 pyarrow==5.0.0 pyasn1==0.4.8 pyasn1-modules==0.2.8 @@ -46,4 +48,4 @@ six==1.16.0 typing-extensions==3.10.0.2 typing-inspect==0.7.1 urllib3==1.26.7 -zipp==3.5.0 +zipp==3.6.0 diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index b008613f0..93a9122cc 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -87,7 +87,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9"] +ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"] # Any default versions that should be ignored. IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] @@ -98,6 +98,10 @@ def get_pytest_env_vars() -> Dict[str, str]: "True", "true", ) + +# Error if a python version is missing +nox.options.error_on_missing_interpreters = True + # # Style Checks # diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index cbf4dff27..39ea3e878 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -24,6 +24,7 @@ import google.api_core.retry import pkg_resources import pytest +import numpy from google.cloud import bigquery from google.cloud import bigquery_storage @@ -83,6 +84,81 @@ def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_i ("uint8_col", pandas.Series([0, 1, 2], dtype="uint8")), ("uint16_col", pandas.Series([3, 4, 5], dtype="uint16")), ("uint32_col", pandas.Series([6, 7, 8], dtype="uint32")), + ("array_bool_col", pandas.Series([[True], [False], [True]])), + ( + "array_ts_col", + pandas.Series( + [ + [ + datetime.datetime( + 2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc + ), + ], + [ + datetime.datetime( + 2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc + ), + ], + [ + datetime.datetime( + 2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc + ), + ], + ], + ), + ), + ( + "array_dt_col", + pandas.Series( + [ + [datetime.datetime(2010, 1, 2, 3, 44, 50)], + [datetime.datetime(2011, 2, 3, 14, 50, 59)], + [datetime.datetime(2012, 3, 14, 15, 16)], + ], + ), + ), + ( + "array_float32_col", + pandas.Series( + [numpy.array([_], dtype="float32") for _ in [1.0, 2.0, 3.0]] + ), + ), + ( + "array_float64_col", + pandas.Series( + [numpy.array([_], dtype="float64") for _ in [4.0, 5.0, 6.0]] + ), + ), + ( + "array_int8_col", + pandas.Series( + [numpy.array([_], dtype="int8") for _ in [-12, -11, -10]] + ), + ), + ( + "array_int16_col", + pandas.Series([numpy.array([_], dtype="int16") for _ in [-9, -8, -7]]), + ), + ( + "array_int32_col", + 
pandas.Series([numpy.array([_], dtype="int32") for _ in [-6, -5, -4]]), + ), + ( + "array_int64_col", + pandas.Series([numpy.array([_], dtype="int64") for _ in [-3, -2, -1]]), + ), + ( + "array_uint8_col", + pandas.Series([numpy.array([_], dtype="uint8") for _ in [0, 1, 2]]), + ), + ( + "array_uint16_col", + pandas.Series([numpy.array([_], dtype="uint16") for _ in [3, 4, 5]]), + ), + ( + "array_uint32_col", + pandas.Series([numpy.array([_], dtype="uint32") for _ in [6, 7, 8]]), + ), ] ) dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) @@ -98,9 +174,8 @@ def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_i assert tuple(table.schema) == ( bigquery.SchemaField("bool_col", "BOOLEAN"), bigquery.SchemaField("ts_col", "TIMESTAMP"), - # BigQuery does not support uploading DATETIME values from - # Parquet files. See: - # https://github.com/googleapis/google-cloud-python/issues/9996 + # TODO: Update to DATETIME in V3 + # https://github.com/googleapis/python-bigquery/issues/985 bigquery.SchemaField("dt_col", "TIMESTAMP"), bigquery.SchemaField("float32_col", "FLOAT"), bigquery.SchemaField("float64_col", "FLOAT"), @@ -111,6 +186,20 @@ def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_i bigquery.SchemaField("uint8_col", "INTEGER"), bigquery.SchemaField("uint16_col", "INTEGER"), bigquery.SchemaField("uint32_col", "INTEGER"), + bigquery.SchemaField("array_bool_col", "BOOLEAN", mode="REPEATED"), + bigquery.SchemaField("array_ts_col", "TIMESTAMP", mode="REPEATED"), + # TODO: Update to DATETIME in V3 + # https://github.com/googleapis/python-bigquery/issues/985 + bigquery.SchemaField("array_dt_col", "TIMESTAMP", mode="REPEATED"), + bigquery.SchemaField("array_float32_col", "FLOAT", mode="REPEATED"), + bigquery.SchemaField("array_float64_col", "FLOAT", mode="REPEATED"), + bigquery.SchemaField("array_int8_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_int16_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_int32_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_int64_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_uint8_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_uint16_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_uint32_col", "INTEGER", mode="REPEATED"), ) assert table.num_rows == 3 diff --git a/tests/system/test_query.py b/tests/system/test_query.py new file mode 100644 index 000000000..24758595b --- /dev/null +++ b/tests/system/test_query.py @@ -0,0 +1,29 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from google.cloud import bigquery + + +def test_dry_run(bigquery_client: bigquery.Client, scalars_table: str): + query_config = bigquery.QueryJobConfig() + query_config.dry_run = True + + query_string = f"SELECT * FROM {scalars_table}" + query_job = bigquery_client.query(query_string, job_config=query_config,) + + # Note: `query_job.result()` is not necessary on a dry run query. All + # necessary information is returned in the initial response. + assert query_job.dry_run is True + assert query_job.total_bytes_processed > 0 + assert len(query_job.schema) > 0 diff --git a/tests/unit/job/test_query.py b/tests/unit/job/test_query.py index 4c598d797..17baacf5b 100644 --- a/tests/unit/job/test_query.py +++ b/tests/unit/job/test_query.py @@ -26,6 +26,7 @@ from google.cloud.bigquery.client import _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS import google.cloud.bigquery.query +from google.cloud.bigquery.table import _EmptyRowIterator from ..helpers import make_connection @@ -268,25 +269,6 @@ def test_ctor_w_query_parameters(self): job = self._make_one(self.JOB_ID, self.QUERY, client, job_config=config) self.assertEqual(job.query_parameters, query_parameters) - def test_from_api_repr_missing_identity(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = {} - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - - def test_from_api_repr_missing_config(self): - self._setUpConstants() - client = _make_client(project=self.PROJECT) - RESOURCE = { - "id": "%s:%s" % (self.PROJECT, self.DS_ID), - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - } - klass = self._get_target_class() - with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) - def test_from_api_repr_bare(self): self._setUpConstants() client = _make_client(project=self.PROJECT) @@ -989,6 +971,19 @@ def test_result(self): [query_results_call, query_results_call, reload_call, query_page_call] ) + def test_result_dry_run(self): + job_resource = self._make_resource(started=True, location="EU") + job_resource["configuration"]["dryRun"] = True + conn = make_connection() + client = _make_client(self.PROJECT, connection=conn) + job = self._get_target_class().from_api_repr(job_resource, client) + + result = job.result() + + calls = conn.api_request.mock_calls + self.assertIsInstance(result, _EmptyRowIterator) + self.assertEqual(calls, []) + def test_result_with_done_job_calls_get_query_results(self): query_resource_done = { "jobComplete": True, @@ -1391,6 +1386,43 @@ def test_result_transport_timeout_error(self): with call_api_patch, self.assertRaises(concurrent.futures.TimeoutError): job.result(timeout=1) + def test_no_schema(self): + client = _make_client(project=self.PROJECT) + resource = {} + klass = self._get_target_class() + job = klass.from_api_repr(resource, client=client) + assert job.schema is None + + def test_schema(self): + client = _make_client(project=self.PROJECT) + resource = { + "statistics": { + "query": { + "schema": { + "fields": [ + {"mode": "NULLABLE", "name": "bool_col", "type": "BOOLEAN"}, + { + "mode": "NULLABLE", + "name": "string_col", + "type": "STRING", + }, + { + "mode": "NULLABLE", + "name": "timestamp_col", + "type": "TIMESTAMP", + }, + ] + }, + }, + }, + } + klass = self._get_target_class() + job = klass.from_api_repr(resource, client=client) + assert len(job.schema) == 3 + assert job.schema[0].field_type == "BOOLEAN" + assert job.schema[1].field_type == "STRING" + assert 
job.schema[2].field_type == "TIMESTAMP" + def test__begin_error(self): from google.cloud import exceptions diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 2ddf98077..2504b2838 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -53,6 +53,7 @@ from google.cloud import bigquery_storage from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery.retry import DEFAULT_TIMEOUT +from google.cloud.bigquery import ParquetOptions from tests.unit.helpers import make_connection @@ -6850,6 +6851,176 @@ def test_load_table_from_dataframe_w_custom_job_config_w_source_format(self): # the original config object should not have been modified assert job_config.to_api_repr() == original_config_copy.to_api_repr() + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_load_table_from_dataframe_w_parquet_options_none(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.schema import SchemaField + + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + dataframe = pandas.DataFrame(records) + + job_config = job.LoadJobConfig( + write_disposition=job.WriteDisposition.WRITE_TRUNCATE, + source_format=job.SourceFormat.PARQUET, + ) + + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock( + schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")] + ), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + with load_patch as load_table_from_file, get_table_patch as get_table: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION + ) + + # no need to fetch and inspect table schema for WRITE_TRUNCATE jobs + assert not get_table.called + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + num_retries=_DEFAULT_NUM_RETRIES, + rewind=True, + size=mock.ANY, + job_id=mock.ANY, + job_id_prefix=None, + location=self.LOCATION, + project=None, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.parquet_options.enable_list_inference is True + + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_load_table_from_dataframe_w_list_inference_none(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.schema import SchemaField + + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + dataframe = pandas.DataFrame(records) + + parquet_options = ParquetOptions() + + job_config = job.LoadJobConfig( + write_disposition=job.WriteDisposition.WRITE_TRUNCATE, + source_format=job.SourceFormat.PARQUET, + ) + job_config.parquet_options = parquet_options + + original_config_copy = copy.deepcopy(job_config) + + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock( + schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")] + ), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + with load_patch as load_table_from_file, get_table_patch as get_table: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION 
+ ) + + # no need to fetch and inspect table schema for WRITE_TRUNCATE jobs + assert not get_table.called + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + num_retries=_DEFAULT_NUM_RETRIES, + rewind=True, + size=mock.ANY, + job_id=mock.ANY, + job_id_prefix=None, + location=self.LOCATION, + project=None, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.parquet_options.enable_list_inference is None + + # the original config object should not have been modified + assert job_config.to_api_repr() == original_config_copy.to_api_repr() + + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_load_table_from_dataframe_w_list_inference_false(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.schema import SchemaField + + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + dataframe = pandas.DataFrame(records) + + parquet_options = ParquetOptions() + parquet_options.enable_list_inference = False + + job_config = job.LoadJobConfig( + write_disposition=job.WriteDisposition.WRITE_TRUNCATE, + source_format=job.SourceFormat.PARQUET, + ) + job_config.parquet_options = parquet_options + + original_config_copy = copy.deepcopy(job_config) + + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock( + schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")] + ), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + with load_patch as load_table_from_file, get_table_patch as get_table: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION + ) + + # no need to fetch and inspect table schema for WRITE_TRUNCATE jobs + assert not get_table.called + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + num_retries=_DEFAULT_NUM_RETRIES, + rewind=True, + size=mock.ANY, + job_id=mock.ANY, + job_id_prefix=None, + location=self.LOCATION, + project=None, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.parquet_options.enable_list_inference is False + + # the original config object should not have been modified + assert job_config.to_api_repr() == original_config_copy.to_api_repr() + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_load_table_from_dataframe_w_custom_job_config_w_wrong_source_format(self): from google.cloud.bigquery import job @@ -7253,6 +7424,122 @@ def test_load_table_from_dataframe_struct_fields(self): assert sent_config.source_format == job.SourceFormat.PARQUET assert sent_config.schema == schema + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_load_table_from_dataframe_array_fields(self): + """Test that a DataFrame with array columns can be uploaded correctly. 
+ + See: https://github.com/googleapis/python-bigquery/issues/19 + """ + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.schema import SchemaField + + client = self._make_client() + + records = [(3.14, [1, 2])] + dataframe = pandas.DataFrame( + data=records, columns=["float_column", "array_column"] + ) + + schema = [ + SchemaField("float_column", "FLOAT"), + SchemaField("array_column", "INTEGER", mode="REPEATED",), + ] + job_config = job.LoadJobConfig(schema=schema) + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=google.api_core.exceptions.NotFound("Table not found"), + ) + + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_dataframe( + dataframe, + self.TABLE_REF, + job_config=job_config, + location=self.LOCATION, + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + num_retries=_DEFAULT_NUM_RETRIES, + rewind=True, + size=mock.ANY, + job_id=mock.ANY, + job_id_prefix=None, + location=self.LOCATION, + project=None, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.PARQUET + assert sent_config.schema == schema + + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_load_table_from_dataframe_array_fields_w_auto_schema(self): + """Test that a DataFrame with array columns can be uploaded correctly. + + See: https://github.com/googleapis/python-bigquery/issues/19 + """ + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.schema import SchemaField + + client = self._make_client() + + records = [(3.14, [1, 2])] + dataframe = pandas.DataFrame( + data=records, columns=["float_column", "array_column"] + ) + + expected_schema = [ + SchemaField("float_column", "FLOAT"), + SchemaField("array_column", "INT64", mode="REPEATED",), + ] + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=google.api_core.exceptions.NotFound("Table not found"), + ) + + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, location=self.LOCATION, + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + num_retries=_DEFAULT_NUM_RETRIES, + rewind=True, + size=mock.ANY, + job_id=mock.ANY, + job_id_prefix=None, + location=self.LOCATION, + project=None, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.PARQUET + assert sent_config.schema == expected_schema + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_load_table_from_dataframe_w_partial_schema(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES diff --git a/tests/unit/test_external_config.py b/tests/unit/test_external_config.py index 3dc9dd179..3ef61d738 100644 --- a/tests/unit/test_external_config.py +++ b/tests/unit/test_external_config.py @@ -163,7 +163,7 @@ 
def test_to_api_repr_sheets(self): options = external_config.GoogleSheetsOptions() options.skip_leading_rows = 123 options.range = "Sheet1!A5:B10" - ec._options = options + ec.google_sheets_options = options exp_resource = { "sourceFormat": "GOOGLE_SHEETS", @@ -277,7 +277,7 @@ def test_to_api_repr_csv(self): options.quote_character = "quote" options.skip_leading_rows = 123 options.allow_jagged_rows = False - ec._options = options + ec.csv_options = options exp_resource = { "sourceFormat": "CSV", @@ -368,7 +368,7 @@ def test_to_api_repr_bigtable(self): options = external_config.BigtableOptions() options.ignore_unspecified_column_families = True options.read_rowkey_as_string = False - ec._options = options + ec.bigtable_options = options fam1 = external_config.BigtableColumnFamily() fam1.family_id = "familyId" @@ -425,10 +425,166 @@ def test_to_api_repr_bigtable(self): self.assertEqual(got_resource, exp_resource) - def test_parquet_options_getter(self): + def test_avro_options_getter_and_setter(self): + from google.cloud.bigquery.external_config import AvroOptions + + options = AvroOptions.from_api_repr({"useAvroLogicalTypes": True}) + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.AVRO) + + self.assertIsNone(ec.avro_options.use_avro_logical_types) + + ec.avro_options = options + + self.assertTrue(ec.avro_options.use_avro_logical_types) + self.assertIs( + ec.options._properties, ec._properties[AvroOptions._RESOURCE_NAME] + ) + self.assertIs( + ec.avro_options._properties, ec._properties[AvroOptions._RESOURCE_NAME] + ) + + def test_avro_options_getter_empty(self): + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.AVRO) + self.assertIsNotNone(ec.avro_options) + + def test_avro_options_getter_wrong_format(self): + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + self.assertIsNone(ec.avro_options) + + def test_avro_options_setter_wrong_format(self): + from google.cloud.bigquery.format_options import AvroOptions + + options = AvroOptions() + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + + with self.assertRaisesRegex(TypeError, "Cannot set.*source format is CSV"): + ec.avro_options = options + + def test_bigtable_options_getter_and_setter(self): + from google.cloud.bigquery.external_config import BigtableOptions + + options = BigtableOptions.from_api_repr( + {"ignoreUnspecifiedColumnFamilies": True, "readRowkeyAsString": False} + ) + ec = external_config.ExternalConfig( + external_config.ExternalSourceFormat.BIGTABLE + ) + + self.assertIsNone(ec.bigtable_options.ignore_unspecified_column_families) + self.assertIsNone(ec.bigtable_options.read_rowkey_as_string) + + ec.bigtable_options = options + + self.assertTrue(ec.bigtable_options.ignore_unspecified_column_families) + self.assertFalse(ec.bigtable_options.read_rowkey_as_string) + self.assertIs( + ec.options._properties, ec._properties[BigtableOptions._RESOURCE_NAME] + ) + self.assertIs( + ec.bigtable_options._properties, + ec._properties[BigtableOptions._RESOURCE_NAME], + ) + + def test_bigtable_options_getter_empty(self): + ec = external_config.ExternalConfig( + external_config.ExternalSourceFormat.BIGTABLE + ) + self.assertIsNotNone(ec.bigtable_options) + + def test_bigtable_options_getter_wrong_format(self): + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + self.assertIsNone(ec.bigtable_options) + + def test_bigtable_options_setter_wrong_format(self): + from google.cloud.bigquery.external_config 
import BigtableOptions + + options = BigtableOptions() + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + + with self.assertRaisesRegex(TypeError, "Cannot set.*source format is CSV"): + ec.bigtable_options = options + + def test_csv_options_getter_and_setter(self): + from google.cloud.bigquery.external_config import CSVOptions + + options = CSVOptions.from_api_repr( + {"allowJaggedRows": True, "allowQuotedNewlines": False} + ) + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + + self.assertIsNone(ec.csv_options.allow_jagged_rows) + self.assertIsNone(ec.csv_options.allow_quoted_newlines) + + ec.csv_options = options + + self.assertTrue(ec.csv_options.allow_jagged_rows) + self.assertFalse(ec.csv_options.allow_quoted_newlines) + self.assertIs(ec.options._properties, ec._properties[CSVOptions._RESOURCE_NAME]) + self.assertIs( + ec.csv_options._properties, ec._properties[CSVOptions._RESOURCE_NAME] + ) + + def test_csv_options_getter_empty(self): + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + self.assertIsNotNone(ec.csv_options) + + def test_csv_options_getter_wrong_format(self): + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.AVRO) + self.assertIsNone(ec.csv_options) + + def test_csv_options_setter_wrong_format(self): + from google.cloud.bigquery.external_config import CSVOptions + + options = CSVOptions() + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.AVRO) + + with self.assertRaisesRegex(TypeError, "Cannot set.*source format is AVRO"): + ec.csv_options = options + + def test_google_sheets_options_getter_and_setter(self): + from google.cloud.bigquery.external_config import GoogleSheetsOptions + + options = GoogleSheetsOptions.from_api_repr({"skipLeadingRows": "123"}) + ec = external_config.ExternalConfig( + external_config.ExternalSourceFormat.GOOGLE_SHEETS + ) + + self.assertIsNone(ec.google_sheets_options.skip_leading_rows) + + ec.google_sheets_options = options + + self.assertEqual(ec.google_sheets_options.skip_leading_rows, 123) + self.assertIs( + ec.options._properties, ec._properties[GoogleSheetsOptions._RESOURCE_NAME] + ) + self.assertIs( + ec.google_sheets_options._properties, + ec._properties[GoogleSheetsOptions._RESOURCE_NAME], + ) + + def test_google_sheets_options_getter_empty(self): + ec = external_config.ExternalConfig( + external_config.ExternalSourceFormat.GOOGLE_SHEETS + ) + self.assertIsNotNone(ec.google_sheets_options) + + def test_google_sheets_options_getter_wrong_format(self): + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + self.assertIsNone(ec.google_sheets_options) + + def test_google_sheets_options_setter_wrong_format(self): + from google.cloud.bigquery.external_config import GoogleSheetsOptions + + options = GoogleSheetsOptions() + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + + with self.assertRaisesRegex(TypeError, "Cannot set.*source format is CSV"): + ec.google_sheets_options = options + + def test_parquet_options_getter_and_setter(self): from google.cloud.bigquery.format_options import ParquetOptions - parquet_options = ParquetOptions.from_api_repr( + options = ParquetOptions.from_api_repr( {"enumAsString": True, "enableListInference": False} ) ec = external_config.ExternalConfig( @@ -438,32 +594,50 @@ def test_parquet_options_getter(self): self.assertIsNone(ec.parquet_options.enum_as_string) 
self.assertIsNone(ec.parquet_options.enable_list_inference) - ec._options = parquet_options + ec.parquet_options = options self.assertTrue(ec.parquet_options.enum_as_string) self.assertFalse(ec.parquet_options.enable_list_inference) + self.assertIs( + ec.options._properties, ec._properties[ParquetOptions._RESOURCE_NAME] + ) + self.assertIs( + ec.parquet_options._properties, + ec._properties[ParquetOptions._RESOURCE_NAME], + ) - self.assertIs(ec.parquet_options, ec.options) - - def test_parquet_options_getter_non_parquet_format(self): - ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) - self.assertIsNone(ec.parquet_options) + def test_parquet_options_set_properties(self): + """Check that setting sub-properties works without having to create a + new ParquetOptions instance. - def test_parquet_options_setter(self): - from google.cloud.bigquery.format_options import ParquetOptions + This is required for compatibility with previous + ExternalConfig._options implementation. + """ - parquet_options = ParquetOptions.from_api_repr( - {"enumAsString": False, "enableListInference": True} - ) ec = external_config.ExternalConfig( external_config.ExternalSourceFormat.PARQUET ) - ec.parquet_options = parquet_options + self.assertIsNone(ec.parquet_options.enum_as_string) + self.assertIsNone(ec.parquet_options.enable_list_inference) + + ec.parquet_options.enum_as_string = True + ec.parquet_options.enable_list_inference = False + + self.assertTrue(ec.options.enum_as_string) + self.assertFalse(ec.options.enable_list_inference) + self.assertTrue(ec.parquet_options.enum_as_string) + self.assertFalse(ec.parquet_options.enable_list_inference) + + def test_parquet_options_getter_empty(self): + ec = external_config.ExternalConfig( + external_config.ExternalSourceFormat.PARQUET + ) + self.assertIsNotNone(ec.parquet_options) - # Setting Parquet options should be reflected in the generic options attribute. - self.assertFalse(ec.options.enum_as_string) - self.assertTrue(ec.options.enable_list_inference) + def test_parquet_options_getter_non_parquet_format(self): + ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV) + self.assertIsNone(ec.parquet_options) def test_parquet_options_setter_non_parquet_format(self): from google.cloud.bigquery.format_options import ParquetOptions @@ -514,7 +688,7 @@ def test_to_api_repr_parquet(self): options = ParquetOptions.from_api_repr( dict(enumAsString=False, enableListInference=True) ) - ec._options = options + ec.parquet_options = options exp_resource = { "sourceFormat": external_config.ExternalSourceFormat.PARQUET, @@ -584,6 +758,117 @@ def test_to_api_repr_decimal_target_types_unset(self): ec.decimal_target_types = None # No error if unsetting when already unset. 
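# Illustrative sketch of the public, format-specific option properties covered by
# the tests above (not a patch hunk; values are hypothetical). It mirrors the
# asserted behavior: an options object can be assigned whole, sub-properties can be
# set in place, and a property for a non-matching source format reads as None.
from google.cloud.bigquery import external_config

# Assign a pre-built options object to the matching property.
ec = external_config.ExternalConfig(external_config.ExternalSourceFormat.CSV)
csv_options = external_config.CSVOptions()
csv_options.skip_leading_rows = 1
csv_options.allow_jagged_rows = False
ec.csv_options = csv_options

# Or set sub-properties directly, without building a separate options instance.
parquet_config = external_config.ExternalConfig(
    external_config.ExternalSourceFormat.PARQUET
)
parquet_config.parquet_options.enable_list_inference = True

# A property for a different source format reads as None; assigning to it would
# raise TypeError ("Cannot set ... source format is CSV").
assert ec.parquet_options is None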
+class BigtableOptions(unittest.TestCase): + def test_to_api_repr(self): + options = external_config.BigtableOptions() + family1 = external_config.BigtableColumnFamily() + column1 = external_config.BigtableColumn() + column1.qualifier_string = "col1" + column1.field_name = "bqcol1" + column1.type_ = "FLOAT" + column1.encoding = "TEXT" + column1.only_read_latest = True + column2 = external_config.BigtableColumn() + column2.qualifier_encoded = b"col2" + column2.field_name = "bqcol2" + column2.type_ = "STRING" + column2.only_read_latest = False + family1.family_id = "family1" + family1.type_ = "INTEGER" + family1.encoding = "BINARY" + family1.columns = [column1, column2] + family1.only_read_latest = False + family2 = external_config.BigtableColumnFamily() + column3 = external_config.BigtableColumn() + column3.qualifier_string = "col3" + family2.family_id = "family2" + family2.type_ = "BYTES" + family2.encoding = "TEXT" + family2.columns = [column3] + family2.only_read_latest = True + options.column_families = [family1, family2] + options.ignore_unspecified_column_families = False + options.read_rowkey_as_string = True + + resource = options.to_api_repr() + + expected_column_families = [ + { + "familyId": "family1", + "type": "INTEGER", + "encoding": "BINARY", + "columns": [ + { + "qualifierString": "col1", + "fieldName": "bqcol1", + "type": "FLOAT", + "encoding": "TEXT", + "onlyReadLatest": True, + }, + { + "qualifierEncoded": "Y29sMg==", + "fieldName": "bqcol2", + "type": "STRING", + "onlyReadLatest": False, + }, + ], + "onlyReadLatest": False, + }, + { + "familyId": "family2", + "type": "BYTES", + "encoding": "TEXT", + "columns": [{"qualifierString": "col3"}], + "onlyReadLatest": True, + }, + ] + self.maxDiff = None + self.assertEqual( + resource, + { + "columnFamilies": expected_column_families, + "ignoreUnspecifiedColumnFamilies": False, + "readRowkeyAsString": True, + }, + ) + + +class CSVOptions(unittest.TestCase): + def test_to_api_repr(self): + options = external_config.CSVOptions() + options.field_delimiter = "\t" + options.skip_leading_rows = 42 + options.quote_character = '"' + options.allow_quoted_newlines = True + options.allow_jagged_rows = False + options.encoding = "UTF-8" + + resource = options.to_api_repr() + + self.assertEqual( + resource, + { + "fieldDelimiter": "\t", + "skipLeadingRows": "42", + "quote": '"', + "allowQuotedNewlines": True, + "allowJaggedRows": False, + "encoding": "UTF-8", + }, + ) + + +class TestGoogleSheetsOptions(unittest.TestCase): + def test_to_api_repr(self): + options = external_config.GoogleSheetsOptions() + options.range = "sheet1!A1:B20" + options.skip_leading_rows = 107 + + resource = options.to_api_repr() + + self.assertEqual(resource, {"range": "sheet1!A1:B20", "skipLeadingRows": "107"}) + + def _copy_and_update(d, u): d = copy.deepcopy(d) d.update(u) diff --git a/tests/unit/test_format_options.py b/tests/unit/test_format_options.py index ab5f9e05c..c8fecbfa6 100644 --- a/tests/unit/test_format_options.py +++ b/tests/unit/test_format_options.py @@ -13,6 +13,29 @@ # limitations under the License. 
+class TestAvroOptions: + @staticmethod + def _get_target_class(): + from google.cloud.bigquery.format_options import AvroOptions + + return AvroOptions + + def test_ctor(self): + config = self._get_target_class()() + assert config.use_avro_logical_types is None + + def test_from_api_repr(self): + config = self._get_target_class().from_api_repr({"useAvroLogicalTypes": True}) + assert config.use_avro_logical_types + + def test_to_api_repr(self): + config = self._get_target_class()() + config.use_avro_logical_types = False + + result = config.to_api_repr() + assert result == {"useAvroLogicalTypes": False} + + class TestParquetOptions: @staticmethod def _get_target_class(): diff --git a/tests/unit/test_legacy_types.py b/tests/unit/test_legacy_types.py index 49ccb8e5a..4638d3762 100644 --- a/tests/unit/test_legacy_types.py +++ b/tests/unit/test_legacy_types.py @@ -15,7 +15,7 @@ import warnings -def test_imprting_legacy_types_emits_warning(): +def test_importing_legacy_types_emits_warning(): with warnings.catch_warnings(record=True) as warned: from google.cloud.bigquery_v2 import types # noqa: F401