Skip to content

Commit ffc5097

Browse files
author
Artemy Kolchinsky
committed
ENH: Infer dtype from non-nulls when pushing to SQL
Minor cleanup Minor rename Simplifying Code review fixes Complex numbers Release note Release note
1 parent 406c84d commit ffc5097

File tree

3 files changed

+118
-54
lines changed

3 files changed

+118
-54
lines changed

doc/source/whatsnew/v0.15.2.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ Enhancements
115115
- ``to_datetime`` gains an ``exact`` keyword that allows a format to not require an exact match for the provided format string (if it is ``False``). ``exact`` defaults to ``True`` (meaning that exact matching is still the default) (:issue:`8904`)
116116
- Added ``axvlines`` boolean option to the ``parallel_coordinates`` plot function; it determines whether vertical lines will be printed (default is ``True``)
117117
- Added the ability for ``read_html`` to read table footers (:issue:`8552`)
118+
- ``to_sql`` now infers datatypes of non-NA values for columns that contain NA values and have dtype ``object`` (:issue:`8778`).
118119

119120
.. _whatsnew_0152.performance:
120121

pandas/io/sql.py

Lines changed: 54 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -885,37 +885,56 @@ def _harmonize_columns(self, parse_dates=None):
885885
except KeyError:
886886
pass # this column not in results
887887

888+
def _get_notnull_col_dtype(self, col):
889+
"""
890+
Infer datatype of the Series col. In case the dtype of col is 'object'
891+
and it contains NA values, this infers the datatype of the not-NA
892+
values. Needed for inserting typed data containing NULLs, GH8778.
893+
"""
894+
col_for_inference = col
895+
if col.dtype == 'object':
896+
notnulldata = col[~isnull(col)]
897+
if len(notnulldata):
898+
col_for_inference = notnulldata
899+
900+
return lib.infer_dtype(col_for_inference)
901+
888902
def _sqlalchemy_type(self, col):
889-
from sqlalchemy.types import (BigInteger, Float, Text, Boolean,
890-
DateTime, Date, Time)
891903

892904
dtype = self.dtype or {}
893905
if col.name in dtype:
894906
return self.dtype[col.name]
895907

896-
if com.is_datetime64_dtype(col):
908+
col_type = self._get_notnull_col_dtype(col)
909+
910+
from sqlalchemy.types import (BigInteger, Float, Text, Boolean,
911+
DateTime, Date, Time)
912+
913+
if col_type == 'datetime64':
897914
try:
898915
tz = col.tzinfo
899916
return DateTime(timezone=True)
900917
except:
901918
return DateTime
902-
if com.is_timedelta64_dtype(col):
919+
if col_type == 'timedelta64':
903920
warnings.warn("the 'timedelta' type is not supported, and will be "
904921
"written as integer values (ns frequency) to the "
905922
"database.", UserWarning)
906923
return BigInteger
907-
elif com.is_float_dtype(col):
924+
elif col_type == 'floating':
908925
return Float
909-
elif com.is_integer_dtype(col):
926+
elif col_type == 'integer':
910927
# TODO: Refine integer size.
911928
return BigInteger
912-
elif com.is_bool_dtype(col):
929+
elif col_type == 'boolean':
913930
return Boolean
914-
inferred = lib.infer_dtype(com._ensure_object(col))
915-
if inferred == 'date':
931+
elif col_type == 'date':
916932
return Date
917-
if inferred == 'time':
933+
elif col_type == 'time':
918934
return Time
935+
elif col_type == 'complex':
936+
raise ValueError('Complex datatypes not supported')
937+
919938
return Text
920939

921940
def _numpy_type(self, sqltype):
@@ -1187,15 +1206,15 @@ def _create_sql_schema(self, frame, table_name, keys=None):
11871206
# SQLAlchemy installed
11881207
# SQL type conversions for each DB
11891208
_SQL_TYPES = {
1190-
'text': {
1209+
'string': {
11911210
'mysql': 'VARCHAR (63)',
11921211
'sqlite': 'TEXT',
11931212
},
1194-
'float': {
1213+
'floating': {
11951214
'mysql': 'FLOAT',
11961215
'sqlite': 'REAL',
11971216
},
1198-
'int': {
1217+
'integer': {
11991218
'mysql': 'BIGINT',
12001219
'sqlite': 'INTEGER',
12011220
},
@@ -1211,12 +1230,13 @@ def _create_sql_schema(self, frame, table_name, keys=None):
12111230
'mysql': 'TIME',
12121231
'sqlite': 'TIME',
12131232
},
1214-
'bool': {
1233+
'boolean': {
12151234
'mysql': 'BOOLEAN',
12161235
'sqlite': 'INTEGER',
12171236
}
12181237
}
12191238

1239+
12201240
# SQL enquote and wildcard symbols
12211241
_SQL_SYMB = {
12221242
'mysql': {
@@ -1291,8 +1311,8 @@ def _create_table_setup(self):
12911311
br_l = _SQL_SYMB[flv]['br_l'] # left val quote char
12921312
br_r = _SQL_SYMB[flv]['br_r'] # right val quote char
12931313

1294-
create_tbl_stmts = [(br_l + '%s' + br_r + ' %s') % (cname, ctype)
1295-
for cname, ctype, _ in column_names_and_types]
1314+
create_tbl_stmts = [(br_l + '%s' + br_r + ' %s') % (cname, col_type)
1315+
for cname, col_type, _ in column_names_and_types]
12961316
if self.keys is not None and len(self.keys):
12971317
cnames_br = ",".join([br_l + c + br_r for c in self.keys])
12981318
create_tbl_stmts.append(
@@ -1317,30 +1337,27 @@ def _sql_type_name(self, col):
13171337
dtype = self.dtype or {}
13181338
if col.name in dtype:
13191339
return dtype[col.name]
1320-
pytype = col.dtype.type
1321-
pytype_name = "text"
1322-
if issubclass(pytype, np.floating):
1323-
pytype_name = "float"
1324-
elif com.is_timedelta64_dtype(pytype):
1340+
1341+
col_type = self._get_notnull_col_dtype(col)
1342+
if col_type == 'timedelta64':
13251343
warnings.warn("the 'timedelta' type is not supported, and will be "
13261344
"written as integer values (ns frequency) to the "
13271345
"database.", UserWarning)
1328-
pytype_name = "int"
1329-
elif issubclass(pytype, np.integer):
1330-
pytype_name = "int"
1331-
elif issubclass(pytype, np.datetime64) or pytype is datetime:
1332-
# Caution: np.datetime64 is also a subclass of np.number.
1333-
pytype_name = "datetime"
1334-
elif issubclass(pytype, np.bool_):
1335-
pytype_name = "bool"
1336-
elif issubclass(pytype, np.object):
1337-
pytype = lib.infer_dtype(com._ensure_object(col))
1338-
if pytype == "date":
1339-
pytype_name = "date"
1340-
elif pytype == "time":
1341-
pytype_name = "time"
1342-
1343-
return _SQL_TYPES[pytype_name][self.pd_sql.flavor]
1346+
col_type = "integer"
1347+
1348+
elif col_type == "datetime64":
1349+
col_type = "datetime"
1350+
1351+
elif col_type == "empty":
1352+
col_type = "string"
1353+
1354+
elif col_type == "complex":
1355+
raise ValueError('Complex datatypes not supported')
1356+
1357+
if col_type not in _SQL_TYPES:
1358+
col_type = "string"
1359+
1360+
return _SQL_TYPES[col_type][self.pd_sql.flavor]
13441361

13451362

13461363
class SQLiteDatabase(PandasSQL):

pandas/io/tests/test_sql.py

Lines changed: 63 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,11 @@ def test_timedelta(self):
560560
result = sql.read_sql_query('SELECT * FROM test_timedelta', self.conn)
561561
tm.assert_series_equal(result['foo'], df['foo'].astype('int64'))
562562

563+
def test_complex(self):
564+
df = DataFrame({'a':[1+1j, 2j]})
565+
# Complex data type should raise error
566+
self.assertRaises(ValueError, df.to_sql, 'test_complex', self.conn)
567+
563568
def test_to_sql_index_label(self):
564569
temp_frame = DataFrame({'col1': range(4)})
565570

@@ -1175,19 +1180,38 @@ def test_dtype(self):
11751180
(0.9, None)]
11761181
df = DataFrame(data, columns=cols)
11771182
df.to_sql('dtype_test', self.conn)
1178-
df.to_sql('dtype_test2', self.conn, dtype={'B': sqlalchemy.Boolean})
1183+
df.to_sql('dtype_test2', self.conn, dtype={'B': sqlalchemy.TEXT})
1184+
meta = sqlalchemy.schema.MetaData(bind=self.conn)
1185+
meta.reflect()
1186+
sqltype = meta.tables['dtype_test2'].columns['B'].type
1187+
self.assertTrue(isinstance(sqltype, sqlalchemy.TEXT))
1188+
self.assertRaises(ValueError, df.to_sql,
1189+
'error', self.conn, dtype={'B': str})
1190+
1191+
def test_notnull_dtype(self):
1192+
cols = {'Bool': Series([True,None]),
1193+
'Date': Series([datetime(2012, 5, 1), None]),
1194+
'Int' : Series([1, None], dtype='object'),
1195+
'Float': Series([1.1, None])
1196+
}
1197+
df = DataFrame(cols)
1198+
1199+
tbl = 'notnull_dtype_test'
1200+
df.to_sql(tbl, self.conn)
1201+
returned_df = sql.read_sql_table(tbl, self.conn)
11791202
meta = sqlalchemy.schema.MetaData(bind=self.conn)
11801203
meta.reflect()
1181-
self.assertTrue(isinstance(meta.tables['dtype_test'].columns['B'].type,
1182-
sqltypes.TEXT))
11831204
if self.flavor == 'mysql':
11841205
my_type = sqltypes.Integer
11851206
else:
11861207
my_type = sqltypes.Boolean
1187-
self.assertTrue(isinstance(meta.tables['dtype_test2'].columns['B'].type,
1188-
my_type))
1189-
self.assertRaises(ValueError, df.to_sql,
1190-
'error', self.conn, dtype={'B': bool})
1208+
1209+
col_dict = meta.tables[tbl].columns
1210+
1211+
self.assertTrue(isinstance(col_dict['Bool'].type, my_type))
1212+
self.assertTrue(isinstance(col_dict['Date'].type, sqltypes.DateTime))
1213+
self.assertTrue(isinstance(col_dict['Int'].type, sqltypes.Integer))
1214+
self.assertTrue(isinstance(col_dict['Float'].type, sqltypes.Float))
11911215

11921216

11931217
class TestSQLiteAlchemy(_TestSQLAlchemy):
@@ -1507,6 +1531,13 @@ def test_to_sql_save_index(self):
15071531
def test_transactions(self):
15081532
self._transaction_test()
15091533

1534+
def _get_sqlite_column_type(self, table, column):
1535+
recs = self.conn.execute('PRAGMA table_info(%s)' % table)
1536+
for cid, name, ctype, not_null, default, pk in recs:
1537+
if name == column:
1538+
return ctype
1539+
raise ValueError('Table %s, column %s not found' % (table, column))
1540+
15101541
def test_dtype(self):
15111542
if self.flavor == 'mysql':
15121543
raise nose.SkipTest('Not applicable to MySQL legacy')
@@ -1515,20 +1546,35 @@ def test_dtype(self):
15151546
(0.9, None)]
15161547
df = DataFrame(data, columns=cols)
15171548
df.to_sql('dtype_test', self.conn)
1518-
df.to_sql('dtype_test2', self.conn, dtype={'B': 'bool'})
1549+
df.to_sql('dtype_test2', self.conn, dtype={'B': 'STRING'})
15191550

1520-
def get_column_type(table, column):
1521-
recs = self.conn.execute('PRAGMA table_info(%s)' % table)
1522-
for cid, name, ctype, not_null, default, pk in recs:
1523-
if name == column:
1524-
return ctype
1525-
raise ValueError('Table %s, column %s not found' % (table, column))
1526-
1527-
self.assertEqual(get_column_type('dtype_test', 'B'), 'TEXT')
1528-
self.assertEqual(get_column_type('dtype_test2', 'B'), 'bool')
1551+
# sqlite stores Boolean values as INTEGER
1552+
self.assertEqual(self._get_sqlite_column_type('dtype_test', 'B'), 'INTEGER')
1553+
1554+
self.assertEqual(self._get_sqlite_column_type('dtype_test2', 'B'), 'STRING')
15291555
self.assertRaises(ValueError, df.to_sql,
15301556
'error', self.conn, dtype={'B': bool})
15311557

1558+
def test_notnull_dtype(self):
1559+
if self.flavor == 'mysql':
1560+
raise nose.SkipTest('Not applicable to MySQL legacy')
1561+
1562+
cols = {'Bool': Series([True,None]),
1563+
'Date': Series([datetime(2012, 5, 1), None]),
1564+
'Int' : Series([1, None], dtype='object'),
1565+
'Float': Series([1.1, None])
1566+
}
1567+
df = DataFrame(cols)
1568+
1569+
tbl = 'notnull_dtype_test'
1570+
df.to_sql(tbl, self.conn)
1571+
1572+
self.assertEqual(self._get_sqlite_column_type(tbl, 'Bool'), 'INTEGER')
1573+
self.assertEqual(self._get_sqlite_column_type(tbl, 'Date'), 'TIMESTAMP')
1574+
self.assertEqual(self._get_sqlite_column_type(tbl, 'Int'), 'INTEGER')
1575+
self.assertEqual(self._get_sqlite_column_type(tbl, 'Float'), 'REAL')
1576+
1577+
15321578
class TestMySQLLegacy(TestSQLiteFallback):
15331579
"""
15341580
Test the legacy mode against a MySQL database.

0 commit comments

Comments
 (0)