Commit 616d306

BUG: Fix uploading of dataframes containing int64 and float64 columns
Fixes googleapis#116 and googleapis#96 by loading data in CSV chunks.
1 parent: f040c18
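Why the old path broke (a minimal sketch, not part of the commit): load_data previously serialized the frame row by row via Series.to_json, and DataFrame.iterrows does not preserve dtypes across a mixed row, so int64 values sharing a row with float64 values are upcast and emitted as floats. That is plausibly what tripped BigQuery on mixed frames; encoding whole chunks with DataFrame.to_csv, as this commit does, keeps each column's dtype. The column names below are illustrative only.

import pandas

df = pandas.DataFrame({'intColumn': [1, 2], 'floatColumn': [1.1, 2.2]},
                      columns=['intColumn', 'floatColumn'])

# Old path, row-wise: iterrows() hands back each row as a single Series,
# so the int64 column is upcast to float64 before serialization.
for _, row in df.iterrows():
    print(row.to_json())  # {"intColumn":1.0,"floatColumn":1.1}, ...

# New path, column-wise CSV: per-column dtypes survive.
print(df.to_csv(index=False, header=False))  # 1,1.1 / 2,2.2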

File tree: 4 files changed, +108 −33 lines

  pandas_gbq/_load.py
  pandas_gbq/gbq.py
  pandas_gbq/tests/test__load.py
  pandas_gbq/tests/test_gbq.py

pandas_gbq/_load.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
"""Helper methods for loading data into BigQuery"""

from google.cloud import bigquery
import six


def encode_chunk(dataframe):
    """Return a file-like object of CSV-encoded rows.

    Args:
        dataframe (pandas.DataFrame): A chunk of a dataframe to encode
    """
    csv_buffer = six.StringIO()
    dataframe.to_csv(
        csv_buffer, index=False, header=False, encoding='utf-8',
        date_format='%Y-%m-%d %H:%M')

    # Convert to a BytesIO buffer so that unicode text is properly handled.
    # See: https://github.com/pydata/pandas-gbq/issues/106
    body = csv_buffer.getvalue()
    if isinstance(body, bytes):
        body = body.decode('utf-8')
    body = body.encode('utf-8')
    return six.BytesIO(body)


def encode_chunks(dataframe, chunksize):
    dataframe = dataframe.reset_index(drop=True)
    remaining_rows = len(dataframe)
    total_rows = remaining_rows
    start_index = 0
    while start_index < total_rows:
        chunk_buffer = encode_chunk(
            dataframe[start_index:start_index+chunksize])
        start_index += chunksize
        remaining_rows = max(0, remaining_rows - chunksize)
        yield remaining_rows, chunk_buffer


def load_chunks(client, dataframe, dataset_id, table_id, chunksize):
    destination_table = client.dataset(dataset_id).table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_APPEND'
    job_config.source_format = 'CSV'

    for remaining_rows, chunk_buffer in encode_chunks(dataframe, chunksize):
        yield remaining_rows
        client.load_table_from_file(
            chunk_buffer,
            destination_table,
            job_config=job_config).result()
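A quick illustration of what the encode_chunks generator yields for a small frame (runnable locally, no BigQuery client involved): the count of rows still to be loaded counts down to zero, and each buffer holds one CSV-encoded chunk.

import pandas
from pandas_gbq._load import encode_chunks

df = pandas.DataFrame({'a': range(5)})
for remaining_rows, chunk_buffer in encode_chunks(df, 2):
    print(remaining_rows, chunk_buffer.read())
# 3 b'0\n1\n'
# 1 b'2\n3\n'
# 0 b'4\n'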

pandas_gbq/gbq.py

Lines changed: 7 additions & 33 deletions
@@ -557,44 +557,18 @@ def run_query(self, query, **kwargs):
         return schema, result_rows

     def load_data(self, dataframe, dataset_id, table_id, chunksize):
-        from google.cloud.bigquery import LoadJobConfig
-        from six import BytesIO
+        from pandas_gbq import _load

-        destination_table = self.client.dataset(dataset_id).table(table_id)
-        job_config = LoadJobConfig()
-        job_config.write_disposition = 'WRITE_APPEND'
-        job_config.source_format = 'NEWLINE_DELIMITED_JSON'
-        rows = []
-        remaining_rows = len(dataframe)
-
-        total_rows = remaining_rows
+        total_rows = len(dataframe)
         self._print("\n\n")

-        for index, row in dataframe.reset_index(drop=True).iterrows():
-            row_json = row.to_json(
-                force_ascii=False, date_unit='s', date_format='iso')
-            rows.append(row_json)
-            remaining_rows -= 1
-
-            if (len(rows) % chunksize == 0) or (remaining_rows == 0):
+        try:
+            for remaining_rows in _load.load_chunks(
+                    self.client, dataframe, dataset_id, table_id, chunksize):
                 self._print("\rLoad is {0}% Complete".format(
                     ((total_rows - remaining_rows) * 100) / total_rows))
-
-                body = '{}\n'.format('\n'.join(rows))
-                if isinstance(body, bytes):
-                    body = body.decode('utf-8')
-                body = body.encode('utf-8')
-                body = BytesIO(body)
-
-                try:
-                    self.client.load_table_from_file(
-                        body,
-                        destination_table,
-                        job_config=job_config).result()
-                except self.http_error as ex:
-                    self.process_http_error(ex)
-
-                rows = []
+        except self.http_error as ex:
+            self.process_http_error(ex)

         self._print("\n")
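A note on the surviving progress line: load_chunks yields remaining_rows before submitting each load job, so the percentage measures rows handed off, not rows confirmed loaded. Worked through for six rows and chunksize 2 (true division under Python 3; Python 2's integer / would print 33 and 66):

total_rows = 6
for remaining_rows in (4, 2, 0):  # what encode_chunks yields for 6 rows, chunksize 2
    print(((total_rows - remaining_rows) * 100) / total_rows)
# 33.333..., 66.666..., 100.0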
pandas_gbq/tests/test__load.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
import numpy
import pandas


def test_encode_chunk_with_unicode():
    """Test that a dataframe containing unicode can be encoded as a file.

    See: https://github.com/pydata/pandas-gbq/issues/106
    """
    from pandas_gbq._load import encode_chunk

    df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6),
                          columns=list('ABCD'))
    df['s'] = u'信用卡'
    csv_buffer = encode_chunk(df)
    csv_bytes = csv_buffer.read()
    csv_string = csv_bytes.decode('utf-8')
    assert u'信用卡' in csv_string


def test_encode_chunks_splits_dataframe():
    from pandas_gbq._load import encode_chunks
    df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
    num_chunks = len(list(encode_chunks(df, 2)))
    assert num_chunks == 3

pandas_gbq/tests/test_gbq.py

Lines changed: 24 additions & 0 deletions
@@ -1218,6 +1218,30 @@ def test_upload_other_unicode_data(self):
         tm.assert_numpy_array_equal(expected.values, result.values)

+    def test_upload_mixed_float_and_int(self):
+        """Test that we can upload a dataframe containing an int64 and float64 column.
+        See: https://github.com/pydata/pandas-gbq/issues/116
+        """
+        test_id = "mixed_float_and_int"
+        test_size = 2
+        df = DataFrame(
+            [[1,1.1],[2,2.2]],
+            index=['row 1', 'row 2'],
+            columns=['intColumn','floatColumn'])
+
+        gbq.to_gbq(
+            df, self.destination_table + test_id,
+            _get_project_id(),
+            private_key=_get_private_key_path(),
+            chunksize=10000)
+
+        result_df = gbq.read_gbq("SELECT * FROM {0}".format(
+            self.destination_table + test_id),
+            project_id=_get_project_id(),
+            private_key=_get_private_key_path())
+
+        assert len(result_df) == test_size
+
     def test_generate_schema(self):
         df = tm.makeMixedDataFrame()
         schema = gbq._generate_bq_schema(df)
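The integration test above requires BigQuery credentials; the dtype behavior itself can be sanity-checked offline against encode_chunk (a sketch, not part of the commit):

import pandas
from pandas_gbq._load import encode_chunk

df = pandas.DataFrame(
    [[1, 1.1], [2, 2.2]],
    index=['row 1', 'row 2'],
    columns=['intColumn', 'floatColumn'])
print(encode_chunk(df).read())  # b'1,1.1\n2,2.2\n' -- the ints stay ints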
