diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index fc5aad12cd5e8..e483cebf71614 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -135,12 +135,10 @@ usecols : list-like or callable, default ``None`` import pandas as pd from io import StringIO - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["COL1", "COL3"]) Using this parameter results in much faster parsing time and lower memory usage. squeeze : boolean, default ``False`` @@ -181,10 +179,7 @@ skiprows : list-like or integer, default ``None`` .. ipython:: python - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) @@ -365,17 +360,14 @@ columns: .. ipython:: python import numpy as np - data = ('a,b,c,d\n' - '1,2,3,4\n' - '5,6,7,8\n' - '9,10,11') + + data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11" print(data) df = pd.read_csv(StringIO(data), dtype=object) df - df['a'][0] - df = pd.read_csv(StringIO(data), - dtype={'b': object, 'c': np.float64, 'd': 'Int64'}) + df["a"][0] + df = pd.read_csv(StringIO(data), dtype={"b": object, "c": np.float64, "d": "Int64"}) df.dtypes Fortunately, pandas offers more than one way to ensure that your column(s) @@ -390,14 +382,10 @@ of :func:`~pandas.read_csv`: .. ipython:: python - data = ("col_1\n" - "1\n" - "2\n" - "'A'\n" - "4.22") - df = pd.read_csv(StringIO(data), converters={'col_1': str}) + data = "col_1\n1\n2\n'A'\n4.22" + df = pd.read_csv(StringIO(data), converters={"col_1": str}) df - df['col_1'].apply(type).value_counts() + df["col_1"].apply(type).value_counts() Or you can use the :func:`~pandas.to_numeric` function to coerce the dtypes after reading in the data, @@ -405,9 +393,9 @@ dtypes after reading in the data, .. ipython:: python df2 = pd.read_csv(StringIO(data)) - df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce') + df2["col_1"] = pd.to_numeric(df2["col_1"], errors="coerce") df2 - df2['col_1'].apply(type).value_counts() + df2["col_1"].apply(type).value_counts() which will convert all valid parsing to floats, leaving the invalid parsing as ``NaN``. @@ -429,12 +417,12 @@ worth trying. .. ipython:: python :okwarning: - col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000)) - df = pd.DataFrame({'col_1': col_1}) - df.to_csv('foo.csv') - mixed_df = pd.read_csv('foo.csv') - mixed_df['col_1'].apply(type).value_counts() - mixed_df['col_1'].dtype + col_1 = list(range(500000)) + ["a", "b"] + list(range(500000)) + df = pd.DataFrame({"col_1": col_1}) + df.to_csv("foo.csv") + mixed_df = pd.read_csv("foo.csv") + mixed_df["col_1"].apply(type).value_counts() + mixed_df["col_1"].dtype will result with ``mixed_df`` containing an ``int`` dtype for certain chunks of the column, and ``str`` for others due to the mixed dtypes from the @@ -445,7 +433,8 @@ worth trying. :suppress: import os - os.remove('foo.csv') + + os.remove("foo.csv") .. _io.categorical: @@ -457,21 +446,18 @@ Specifying categorical dtype .. 
ipython:: python - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data)).dtypes - pd.read_csv(StringIO(data), dtype='category').dtypes + pd.read_csv(StringIO(data), dtype="category").dtypes Individual columns can be parsed as a ``Categorical`` using a dict specification: .. ipython:: python - pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + pd.read_csv(StringIO(data), dtype={"col1": "category"}).dtypes Specifying ``dtype='category'`` will result in an unordered ``Categorical`` whose ``categories`` are the unique values observed in the data. For more @@ -482,16 +468,17 @@ that column's ``dtype``. .. ipython:: python from pandas.api.types import CategoricalDtype - dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) - pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes + + dtype = CategoricalDtype(["d", "c", "b", "a"], ordered=True) + pd.read_csv(StringIO(data), dtype={"col1": dtype}).dtypes When using ``dtype=CategoricalDtype``, "unexpected" values outside of ``dtype.categories`` are treated as missing values. .. ipython:: python - dtype = CategoricalDtype(['a', 'b', 'd']) # No 'c' - pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1 + dtype = CategoricalDtype(["a", "b", "d"]) # No 'c' + pd.read_csv(StringIO(data), dtype={"col1": dtype}).col1 This matches the behavior of :meth:`Categorical.set_categories`. @@ -507,11 +494,11 @@ This matches the behavior of :meth:`Categorical.set_categories`. .. ipython:: python - df = pd.read_csv(StringIO(data), dtype='category') + df = pd.read_csv(StringIO(data), dtype="category") df.dtypes - df['col3'] - df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) - df['col3'] + df["col3"] + df["col3"].cat.categories = pd.to_numeric(df["col3"].cat.categories) + df["col3"] Naming and using columns @@ -527,10 +514,7 @@ used as the column names: .. ipython:: python - data = ('a,b,c\n' - '1,2,3\n' - '4,5,6\n' - '7,8,9') + data = "a,b,c\n1,2,3\n4,5,6\n7,8,9" print(data) pd.read_csv(StringIO(data)) @@ -541,19 +525,15 @@ any): .. ipython:: python print(data) - pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0) - pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=None) + pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=0) + pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=None) If the header is in a row other than the first, pass the row number to ``header``. This will skip the preceding rows: .. ipython:: python - data = ('skip this skip it\n' - 'a,b,c\n' - '1,2,3\n' - '4,5,6\n' - '7,8,9') + data = "skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9" pd.read_csv(StringIO(data), header=1) .. note:: @@ -574,9 +554,7 @@ distinguish between them so as to prevent overwriting data: .. ipython:: python - data = ('a,b,a\n' - '0,1,2\n' - '3,4,5') + data = "a,b,a\n0,1,2\n3,4,5" pd.read_csv(StringIO(data)) There is no more duplicate data because ``mangle_dupe_cols=True`` by default, @@ -613,18 +591,18 @@ file, either using the column names, position numbers or a callable: .. 
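Returning briefly to the duplicate-column handling described above, a minimal sketch (reusing the same ``a,b,a`` data) of the renaming that ``mangle_dupe_cols=True`` performs:

.. code-block:: python

    import pandas as pd
    from io import StringIO

    data = "a,b,a\n0,1,2\n3,4,5"

    # the second "a" is renamed to "a.1" so that no data is overwritten
    pd.read_csv(StringIO(data)).columns

The renamed label is what you would use afterwards to select the second column.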
ipython:: python - data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz' + data = "a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz" pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), usecols=['b', 'd']) + pd.read_csv(StringIO(data), usecols=["b", "d"]) pd.read_csv(StringIO(data), usecols=[0, 2, 3]) - pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C']) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["A", "C"]) The ``usecols`` argument can also be used to specify which columns not to use in the final result: .. ipython:: python - pd.read_csv(StringIO(data), usecols=lambda x: x not in ['a', 'c']) + pd.read_csv(StringIO(data), usecols=lambda x: x not in ["a", "c"]) In this case, the callable is specifying that we exclude the "a" and "c" columns from the output. @@ -642,26 +620,15 @@ be ignored. By default, completely blank lines will be ignored as well. .. ipython:: python - data = ('\n' - 'a,b,c\n' - ' \n' - '# commented line\n' - '1,2,3\n' - '\n' - '4,5,6') + data = "\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6" print(data) - pd.read_csv(StringIO(data), comment='#') + pd.read_csv(StringIO(data), comment="#") If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = ('a,b,c\n' - '\n' - '1,2,3\n' - '\n' - '\n' - '4,5,6') + data = "a,b,c\n\n1,2,3\n\n\n4,5,6" pd.read_csv(StringIO(data), skip_blank_lines=False) .. warning:: @@ -672,32 +639,28 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = ('#comment\n' - 'a,b,c\n' - 'A,B,C\n' - '1,2,3') - pd.read_csv(StringIO(data), comment='#', header=1) - data = ('A,B,C\n' - '#comment\n' - 'a,b,c\n' - '1,2,3') - pd.read_csv(StringIO(data), comment='#', skiprows=2) + data = "#comment\na,b,c\nA,B,C\n1,2,3" + pd.read_csv(StringIO(data), comment="#", header=1) + data = "A,B,C\n#comment\na,b,c\n1,2,3" + pd.read_csv(StringIO(data), comment="#", skiprows=2) If both ``header`` and ``skiprows`` are specified, ``header`` will be relative to the end of ``skiprows``. For example: .. ipython:: python - data = ('# empty\n' - '# second empty line\n' - '# third emptyline\n' - 'X,Y,Z\n' - '1,2,3\n' - 'A,B,C\n' - '1,2.,4.\n' - '5.,NaN,10.0\n') + data = ( + "# empty\n" + "# second empty line\n" + "# third emptyline\n" + "X,Y,Z\n" + "1,2,3\n" + "A,B,C\n" + "1,2.,4.\n" + "5.,NaN,10.0\n" + ) print(data) - pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + pd.read_csv(StringIO(data), comment="#", skiprows=4, header=1) .. _io.comments: @@ -709,36 +672,38 @@ Sometimes comments or meta data may be included in a file: .. ipython:: python :suppress: - data = ("ID,level,category\n" - "Patient1,123000,x # really unpleasant\n" - "Patient2,23000,y # wouldn't take his medicine\n" - "Patient3,1234018,z # awesome") + data = ( + "ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) By default, the parser includes the comments in the output: .. ipython:: python - df = pd.read_csv('tmp.csv') + df = pd.read_csv("tmp.csv") df We can suppress the comments using the ``comment`` keyword: .. ipython:: python - df = pd.read_csv('tmp.csv', comment='#') + df = pd.read_csv("tmp.csv", comment="#") df .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. 
_io.unicode: @@ -751,13 +716,12 @@ result in byte strings being decoded to unicode in the result: .. ipython:: python from io import BytesIO - data = (b'word,length\n' - b'Tr\xc3\xa4umen,7\n' - b'Gr\xc3\xbc\xc3\x9fe,5') - data = data.decode('utf8').encode('latin-1') - df = pd.read_csv(BytesIO(data), encoding='latin-1') + + data = b"word,length\n" b"Tr\xc3\xa4umen,7\n" b"Gr\xc3\xbc\xc3\x9fe,5" + data = data.decode("utf8").encode("latin-1") + df = pd.read_csv(BytesIO(data), encoding="latin-1") df - df['word'][1] + df["word"][1] Some formats which encode all characters as multiple bytes, like UTF-16, won't parse correctly at all without specifying the encoding. `Full list of Python @@ -774,16 +738,12 @@ first column will be used as the ``DataFrame``'s row names: .. ipython:: python - data = ('a,b,c\n' - '4,apple,bat,5.7\n' - '8,orange,cow,10') + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" pd.read_csv(StringIO(data)) .. ipython:: python - data = ('index,a,b,c\n' - '4,apple,bat,5.7\n' - '8,orange,cow,10') + data = "index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" pd.read_csv(StringIO(data), index_col=0) Ordinarily, you can achieve this behavior using the ``index_col`` option. @@ -794,9 +754,7 @@ index column inference and discard the last column, pass ``index_col=False``: .. ipython:: python - data = ('a,b,c\n' - '4,apple,bat,\n' - '8,orange,cow,') + data = "a,b,c\n4,apple,bat,\n8,orange,cow," print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), index_col=False) @@ -806,12 +764,10 @@ If a subset of data is being parsed using the ``usecols`` option, the .. ipython:: python - data = ('a,b,c\n' - '4,apple,bat,\n' - '8,orange,cow,') + data = "a,b,c\n4,apple,bat,\n8,orange,cow," print(data) - pd.read_csv(StringIO(data), usecols=['b', 'c']) - pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0) + pd.read_csv(StringIO(data), usecols=["b", "c"]) + pd.read_csv(StringIO(data), usecols=["b", "c"], index_col=0) .. _io.parse_dates: @@ -831,14 +787,14 @@ The simplest case is to just pass in ``parse_dates=True``: .. ipython:: python :suppress: - f = open('foo.csv', 'w') - f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f = open("foo.csv", "w") + f.write("date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5") f.close() .. ipython:: python # Use a column as an index, and parse it as dates. - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True) + df = pd.read_csv("foo.csv", index_col=0, parse_dates=True) df # These are Python datetime objects @@ -856,20 +812,22 @@ column names: .. ipython:: python :suppress: - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. 
ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]]) + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) df By default the parser removes the component date columns, but you can choose @@ -877,8 +835,9 @@ to retain them via the ``keep_date_col`` keyword: .. ipython:: python - df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]], - keep_date_col=True) + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True + ) df Note that if you wish to combine multiple columns into a single date column, a @@ -891,8 +850,8 @@ You can also use a dict to specify custom name columns: .. ipython:: python - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec) + date_spec = {"nominal": [1, 2], "actual": [1, 3]} + df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) df It is important to remember that if multiple text columns are to be parsed into @@ -903,9 +862,10 @@ data columns: .. ipython:: python - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - index_col=0) # index is the nominal column + date_spec = {"nominal": [1, 2], "actual": [1, 3]} + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=date_spec, index_col=0 + ) # index is the nominal column df .. note:: @@ -929,8 +889,9 @@ take full advantage of the flexibility of the date parsing API: .. ipython:: python - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - date_parser=pd.to_datetime) + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=date_spec, date_parser=pd.to_datetime + ) df Pandas will try to call the ``date_parser`` function in three different ways. If @@ -957,7 +918,7 @@ Note that performance-wise, you should try these methods of parsing dates in ord .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.csv.mixed_timezones: @@ -976,17 +937,20 @@ an object-dtype column with strings, even with ``parse_dates``. a 2000-01-01T00:00:00+05:00 2000-01-01T00:00:00+06:00""" - df = pd.read_csv(StringIO(content), parse_dates=['a']) - df['a'] + df = pd.read_csv(StringIO(content), parse_dates=["a"]) + df["a"] To parse the mixed-timezone values as a datetime column, pass a partially-applied :func:`to_datetime` with ``utc=True`` as the ``date_parser``. .. ipython:: python - df = pd.read_csv(StringIO(content), parse_dates=['a'], - date_parser=lambda col: pd.to_datetime(col, utc=True)) - df['a'] + df = pd.read_csv( + StringIO(content), + parse_dates=["a"], + date_parser=lambda col: pd.to_datetime(col, utc=True), + ) + df["a"] .. _io.dayfirst: @@ -1022,14 +986,13 @@ Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With .. ipython:: python # Try to infer the format for the index column - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, - infer_datetime_format=True) + df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) df .. ipython:: python :suppress: - os.remove('foo.csv') + os.remove("foo.csv") International date formats ++++++++++++++++++++++++++ @@ -1040,19 +1003,16 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: .. 
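Tying back to the performance note above, a minimal sketch (with hypothetical in-memory data) of supplying an explicit format through ``date_parser``, so that no per-row format inference is needed:

.. code-block:: python

    import pandas as pd
    from io import StringIO

    data = "date,value\n2000-01-05,1\n2000-01-06,2"

    # an explicit format string avoids guessing the format for every row
    df = pd.read_csv(
        StringIO(data),
        parse_dates=["date"],
        date_parser=lambda col: pd.to_datetime(col, format="%Y-%m-%d"),
    )
    df.dtypes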
ipython:: python :suppress: - data = ("date,value,cat\n" - "1/6/2000,5,a\n" - "2/6/2000,10,b\n" - "3/6/2000,15,c") - with open('tmp.csv', 'w') as fh: + data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + with open("tmp.csv", "w") as fh: fh.write(data) .. ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) - pd.read_csv('tmp.csv', parse_dates=[0]) - pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0]) + pd.read_csv("tmp.csv", parse_dates=[0]) + pd.read_csv("tmp.csv", dayfirst=True, parse_dates=[0]) Writing CSVs to binary file objects +++++++++++++++++++++++++++++++++++ @@ -1084,14 +1044,16 @@ writing to a file). For example: .. ipython:: python - val = '0.3066101993807095471566981359501369297504425048828125' - data = 'a,b,c\n1,2,{0}'.format(val) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision=None)['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision='high')['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision='round_trip')['c'][0] - float(val)) + val = "0.3066101993807095471566981359501369297504425048828125" + data = "a,b,c\n1,2,{0}".format(val) + abs(pd.read_csv(StringIO(data), engine="c", float_precision=None)["c"][0] - float(val)) + abs( + pd.read_csv(StringIO(data), engine="c", float_precision="high")["c"][0] - float(val) + ) + abs( + pd.read_csv(StringIO(data), engine="c", float_precision="round_trip")["c"][0] + - float(val) + ) .. _io.thousands: @@ -1106,20 +1068,22 @@ correctly: .. ipython:: python :suppress: - data = ("ID|level|category\n" - "Patient1|123,000|x\n" - "Patient2|23,000|y\n" - "Patient3|1,234,018|z") + data = ( + "ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) By default, numbers with a thousands separator will be parsed as strings: .. ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', sep='|') + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", sep="|") df df.level.dtype @@ -1128,8 +1092,8 @@ The ``thousands`` keyword allows integers to be parsed correctly: .. ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', sep='|', thousands=',') + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", sep="|", thousands=",") df df.level.dtype @@ -1137,7 +1101,7 @@ The ``thousands`` keyword allows integers to be parsed correctly: .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.na_values: @@ -1162,7 +1126,7 @@ Let us consider some examples: .. code-block:: python - pd.read_csv('path_to_file.csv', na_values=[5]) + pd.read_csv("path_to_file.csv", na_values=[5]) In the example above ``5`` and ``5.0`` will be recognized as ``NaN``, in addition to the defaults. A string will first be interpreted as a numerical @@ -1170,19 +1134,19 @@ addition to the defaults. A string will first be interpreted as a numerical .. code-block:: python - pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=[""]) + pd.read_csv("path_to_file.csv", keep_default_na=False, na_values=[""]) Above, only an empty field will be recognized as ``NaN``. .. code-block:: python - pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=["NA", "0"]) + pd.read_csv("path_to_file.csv", keep_default_na=False, na_values=["NA", "0"]) Above, both ``NA`` and ``0`` as strings are ``NaN``. .. 
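Because the snippets above refer to a hypothetical ``path_to_file.csv``, a self-contained sketch of the same interaction with made-up in-memory data may help:

.. code-block:: python

    import pandas as pd
    from io import StringIO

    data = "a,b\n0,NA\n5,x\n,y"

    # default sentinels: both the empty field and "NA" become NaN
    pd.read_csv(StringIO(data))

    # only the empty field is treated as missing; "NA" survives as a string
    pd.read_csv(StringIO(data), keep_default_na=False, na_values=[""])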
code-block:: python - pd.read_csv('path_to_file.csv', na_values=["Nope"]) + pd.read_csv("path_to_file.csv", na_values=["Nope"]) The default values, in addition to the string ``"Nope"`` are recognized as ``NaN``. @@ -1205,19 +1169,16 @@ as a ``Series``: .. ipython:: python :suppress: - data = ("level\n" - "Patient1,123000\n" - "Patient2,23000\n" - "Patient3,1234018") + data = "level\nPatient1,123000\nPatient2,23000\nPatient3,1234018" - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) - output = pd.read_csv('tmp.csv', squeeze=True) + output = pd.read_csv("tmp.csv", squeeze=True) output type(output) @@ -1225,7 +1186,7 @@ as a ``Series``: .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.boolean: @@ -1239,12 +1200,10 @@ options as follows: .. ipython:: python - data = ('a,b,c\n' - '1,Yes,2\n' - '3,No,4') + data = "a,b,c\n1,Yes,2\n3,No,4" print(data) pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) + pd.read_csv(StringIO(data), true_values=["Yes"], false_values=["No"]) .. _io.bad_lines: @@ -1258,10 +1217,7 @@ too many fields will raise an error by default: .. ipython:: python :okexcept: - data = ('a,b,c\n' - '1,2,3\n' - '4,5,6,7\n' - '8,9,10') + data = "a,b,c\n1,2,3\n4,5,6,7\n8,9,10" pd.read_csv(StringIO(data)) You can elect to skip bad lines: @@ -1301,9 +1257,7 @@ or a :class:`python:csv.Dialect` instance. .. ipython:: python :suppress: - data = ('label1,label2,label3\n' - 'index1,"a,c,e\n' - 'index2,b,d,f') + data = "label1,label2,label3\n" 'index1,"a,c,e\n' "index2,b,d,f" Suppose you had data with unenclosed quotes: @@ -1321,6 +1275,7 @@ We can get around this using ``dialect``: :okwarning: import csv + dia = csv.excel() dia.quoting = csv.QUOTE_NONE pd.read_csv(StringIO(data), dialect=dia) @@ -1329,15 +1284,15 @@ All of the dialect options can be specified separately by keyword arguments: .. ipython:: python - data = 'a,b,c~1,2,3~4,5,6' - pd.read_csv(StringIO(data), lineterminator='~') + data = "a,b,c~1,2,3~4,5,6" + pd.read_csv(StringIO(data), lineterminator="~") Another common dialect option is ``skipinitialspace``, to skip any whitespace after a delimiter: .. ipython:: python - data = 'a, b, c\n1, 2, 3\n4, 5, 6' + data = "a, b, c\n1, 2, 3\n4, 5, 6" print(data) pd.read_csv(StringIO(data), skipinitialspace=True) @@ -1359,7 +1314,7 @@ should pass the ``escapechar`` option: data = 'a,b\n"hello, \\"Bob\\", nice to see you",5' print(data) - pd.read_csv(StringIO(data), escapechar='\\') + pd.read_csv(StringIO(data), escapechar="\\") .. _io.fwf_reader: .. _io.fwf: @@ -1386,12 +1341,14 @@ a different usage of the ``delimiter`` parameter: .. ipython:: python :suppress: - f = open('bar.csv', 'w') - data1 = ("id8141 360.242940 149.910199 11950.7\n" - "id1594 444.953632 166.985655 11788.4\n" - "id1849 364.136849 183.628767 11806.2\n" - "id1230 413.836124 184.375703 11916.8\n" - "id1948 502.953953 173.237159 12468.3") + f = open("bar.csv", "w") + data1 = ( + "id8141 360.242940 149.910199 11950.7\n" + "id1594 444.953632 166.985655 11788.4\n" + "id1849 364.136849 183.628767 11806.2\n" + "id1230 413.836124 184.375703 11916.8\n" + "id1948 502.953953 173.237159 12468.3" + ) f.write(data1) f.close() @@ -1399,7 +1356,7 @@ Consider a typical fixed-width data file: .. 
ipython:: python - print(open('bar.csv').read()) + print(open("bar.csv").read()) In order to parse this file into a ``DataFrame``, we simply need to supply the column specifications to the ``read_fwf`` function along with the file name: @@ -1408,7 +1365,7 @@ column specifications to the ``read_fwf`` function along with the file name: # Column specifications are a list of half-intervals colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] - df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0) + df = pd.read_fwf("bar.csv", colspecs=colspecs, header=None, index_col=0) df Note how the parser automatically picks column names X. when @@ -1419,7 +1376,7 @@ column widths for contiguous columns: # Widths are a list of integers widths = [6, 14, 13, 10] - df = pd.read_fwf('bar.csv', widths=widths, header=None) + df = pd.read_fwf("bar.csv", widths=widths, header=None) df The parser will take care of extra white spaces around the columns @@ -1432,7 +1389,7 @@ is whitespace). .. ipython:: python - df = pd.read_fwf('bar.csv', header=None, index_col=0) + df = pd.read_fwf("bar.csv", header=None, index_col=0) df ``read_fwf`` supports the ``dtype`` parameter for specifying the types of @@ -1440,13 +1397,13 @@ parsed columns to be different from the inferred type. .. ipython:: python - pd.read_fwf('bar.csv', header=None, index_col=0).dtypes - pd.read_fwf('bar.csv', header=None, dtype={2: 'object'}).dtypes + pd.read_fwf("bar.csv", header=None, index_col=0).dtypes + pd.read_fwf("bar.csv", header=None, dtype={2: "object"}).dtypes .. ipython:: python :suppress: - os.remove('bar.csv') + os.remove("bar.csv") Indexes @@ -1458,8 +1415,8 @@ Files with an "implicit" index column .. ipython:: python :suppress: - f = open('foo.csv', 'w') - f.write('A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f = open("foo.csv", "w") + f.write("A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5") f.close() Consider a file with one less entry in the header than the number of data @@ -1467,27 +1424,27 @@ column: .. ipython:: python - print(open('foo.csv').read()) + print(open("foo.csv").read()) In this special case, ``read_csv`` assumes that the first column is to be used as the index of the ``DataFrame``: .. ipython:: python - pd.read_csv('foo.csv') + pd.read_csv("foo.csv") Note that the dates weren't automatically parsed. In that case you would need to do as before: .. ipython:: python - df = pd.read_csv('foo.csv', parse_dates=True) + df = pd.read_csv("foo.csv", parse_dates=True) df.index .. ipython:: python :suppress: - os.remove('foo.csv') + os.remove("foo.csv") Reading an index with a ``MultiIndex`` @@ -1499,7 +1456,7 @@ Suppose you have data indexed by two columns: .. ipython:: python - print(open('data/mindex_ex.csv').read()) + print(open("data/mindex_ex.csv").read()) The ``index_col`` argument to ``read_csv`` can take a list of column numbers to turn multiple columns into a ``MultiIndex`` for the index of the @@ -1523,10 +1480,11 @@ rows will skip the intervening rows. .. ipython:: python from pandas._testing import makeCustomDataframe as mkdf + df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - df.to_csv('mi.csv') - print(open('mi.csv').read()) - pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1]) + df.to_csv("mi.csv") + print(open("mi.csv").read()) + pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1]) ``read_csv`` is also able to interpret a more common format of multi-columns indices. @@ -1535,14 +1493,14 @@ of multi-columns indices. 
:suppress: data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12" - fh = open('mi2.csv', 'w') + fh = open("mi2.csv", "w") fh.write(data) fh.close() .. ipython:: python - print(open('mi2.csv').read()) - pd.read_csv('mi2.csv', header=[0, 1], index_col=0) + print(open("mi2.csv").read()) + pd.read_csv("mi2.csv", header=[0, 1], index_col=0) Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will be *lost*. @@ -1550,8 +1508,8 @@ with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index wi .. ipython:: python :suppress: - os.remove('mi.csv') - os.remove('mi2.csv') + os.remove("mi.csv") + os.remove("mi2.csv") .. _io.sniff: @@ -1566,13 +1524,13 @@ class of the csv module. For this, you have to specify ``sep=None``. :suppress: df = pd.DataFrame(np.random.randn(10, 4)) - df.to_csv('tmp.sv', sep='|') - df.to_csv('tmp2.sv', sep=':') + df.to_csv("tmp.sv", sep="|") + df.to_csv("tmp2.sv", sep=":") .. ipython:: python - print(open('tmp2.sv').read()) - pd.read_csv('tmp2.sv', sep=None, engine='python') + print(open("tmp2.sv").read()) + pd.read_csv("tmp2.sv", sep=None, engine="python") .. _io.multiple_files: @@ -1593,8 +1551,8 @@ rather than reading the entire file into memory, such as the following: .. ipython:: python - print(open('tmp.sv').read()) - table = pd.read_csv('tmp.sv', sep='|') + print(open("tmp.sv").read()) + table = pd.read_csv("tmp.sv", sep="|") table @@ -1603,7 +1561,7 @@ value will be an iterable object of type ``TextFileReader``: .. ipython:: python - reader = pd.read_csv('tmp.sv', sep='|', chunksize=4) + reader = pd.read_csv("tmp.sv", sep="|", chunksize=4) reader for chunk in reader: @@ -1614,14 +1572,14 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: .. ipython:: python - reader = pd.read_csv('tmp.sv', sep='|', iterator=True) + reader = pd.read_csv("tmp.sv", sep="|", iterator=True) reader.get_chunk(5) .. ipython:: python :suppress: - os.remove('tmp.sv') - os.remove('tmp2.sv') + os.remove("tmp.sv") + os.remove("tmp2.sv") Specifying the parser engine '''''''''''''''''''''''''''' @@ -1649,8 +1607,7 @@ functions - the following example shows reading a CSV file: .. code-block:: python - df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', - sep='\t') + df = pd.read_csv("https://download.bls.gov/pub/time.series/cu/cu.item", sep="\t") All URLs which are not local files or HTTP(s) are handled by `fsspec`_, if installed, and its various filesystem implementations @@ -1662,7 +1619,7 @@ S3 URLs require the `s3fs .. code-block:: python - df = pd.read_json('s3://pandas-test/adatafile.json') + df = pd.read_json("s3://pandas-test/adatafile.json") When dealing with remote storage systems, you might need extra configuration with environment variables or config files in @@ -1683,9 +1640,11 @@ specifying an anonymous connection, such as .. code-block:: python - pd.read_csv("s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013" - "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", - storage_options={"anon": True}) + pd.read_csv( + "s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013" + "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", + storage_options={"anon": True}, + ) ``fsspec`` also allows complex URLs, for accessing data in compressed archives, local caching of files, and more. To locally cache the above @@ -1693,9 +1652,11 @@ example, you would modify the call to .. 
code-block:: python - pd.read_csv("simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/" - "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", - storage_options={"s3": {"anon": True}}) + pd.read_csv( + "simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/" + "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", + storage_options={"s3": {"anon": True}}, + ) where we specify that the "anon" parameter is meant for the "s3" part of the implementation, not to the caching implementation. Note that this caches to a temporary @@ -1819,7 +1780,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet .. ipython:: python - dfj = pd.DataFrame(np.random.randn(5, 2), columns=list('AB')) + dfj = pd.DataFrame(np.random.randn(5, 2), columns=list("AB")) json = dfj.to_json() json @@ -1831,10 +1792,13 @@ file / string. Consider the following ``DataFrame`` and ``Series``: .. ipython:: python - dfjo = pd.DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), - columns=list('ABC'), index=list('xyz')) + dfjo = pd.DataFrame( + dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), + columns=list("ABC"), + index=list("xyz"), + ) dfjo - sjo = pd.Series(dict(x=15, y=16, z=17), name='D') + sjo = pd.Series(dict(x=15, y=16, z=17), name="D") sjo **Column oriented** (the default for ``DataFrame``) serializes the data as @@ -1894,24 +1858,24 @@ Writing in ISO date format: .. ipython:: python - dfd = pd.DataFrame(np.random.randn(5, 2), columns=list('AB')) - dfd['date'] = pd.Timestamp('20130101') + dfd = pd.DataFrame(np.random.randn(5, 2), columns=list("AB")) + dfd["date"] = pd.Timestamp("20130101") dfd = dfd.sort_index(1, ascending=False) - json = dfd.to_json(date_format='iso') + json = dfd.to_json(date_format="iso") json Writing in ISO date format, with microseconds: .. ipython:: python - json = dfd.to_json(date_format='iso', date_unit='us') + json = dfd.to_json(date_format="iso", date_unit="us") json Epoch timestamps, in seconds: .. ipython:: python - json = dfd.to_json(date_format='epoch', date_unit='s') + json = dfd.to_json(date_format="epoch", date_unit="s") json Writing to a file, with a date index and a date column: @@ -1919,13 +1883,13 @@ Writing to a file, with a date index and a date column: .. ipython:: python dfj2 = dfj.copy() - dfj2['date'] = pd.Timestamp('20130101') - dfj2['ints'] = list(range(5)) - dfj2['bools'] = True - dfj2.index = pd.date_range('20130101', periods=5) - dfj2.to_json('test.json') + dfj2["date"] = pd.Timestamp("20130101") + dfj2["ints"] = list(range(5)) + dfj2["bools"] = True + dfj2.index = pd.date_range("20130101", periods=5) + dfj2.to_json("test.json") - with open('test.json') as fh: + with open("test.json") as fh: print(fh.read()) Fallback behavior @@ -2060,26 +2024,27 @@ Reading from a file: .. ipython:: python - pd.read_json('test.json') + pd.read_json("test.json") Don't convert any data (but still convert axes and dates): .. ipython:: python - pd.read_json('test.json', dtype=object).dtypes + pd.read_json("test.json", dtype=object).dtypes Specify dtypes for conversion: .. ipython:: python - pd.read_json('test.json', dtype={'A': 'float32', 'bools': 'int8'}).dtypes + pd.read_json("test.json", dtype={"A": "float32", "bools": "int8"}).dtypes Preserve string indices: .. 
ipython:: python - si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), - index=[str(i) for i in range(4)]) + si = pd.DataFrame( + np.zeros((4, 4)), columns=list(range(4)), index=[str(i) for i in range(4)] + ) si si.index si.columns @@ -2094,10 +2059,10 @@ Dates written in nanoseconds need to be read back in nanoseconds: .. ipython:: python - json = dfj2.to_json(date_unit='ns') + json = dfj2.to_json(date_unit="ns") # Try to parse timestamps as milliseconds -> Won't Work - dfju = pd.read_json(json, date_unit='ms') + dfju = pd.read_json(json, date_unit="ms") dfju # Let pandas detect the correct precision @@ -2105,7 +2070,7 @@ Dates written in nanoseconds need to be read back in nanoseconds: dfju # Or specify that all timestamps are in nanoseconds - dfju = pd.read_json(json, date_unit='ns') + dfju = pd.read_json(json, date_unit="ns") dfju The Numpy parameter @@ -2127,7 +2092,7 @@ data: randfloats = np.random.uniform(-100, 1000, 10000) randfloats.shape = (1000, 10) - dffloats = pd.DataFrame(randfloats, columns=list('ABCDEFGHIJ')) + dffloats = pd.DataFrame(randfloats, columns=list("ABCDEFGHIJ")) jsonfloats = dffloats.to_json() @@ -2174,7 +2139,7 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python :suppress: - os.remove('test.json') + os.remove("test.json") .. _io.json_normalize: @@ -2186,38 +2151,54 @@ into a flat table. .. ipython:: python - data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, - {'name': {'given': 'Mose', 'family': 'Regner'}}, - {'id': 2, 'name': 'Faye Raker'}] + data = [ + {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + {"name": {"given": "Mose", "family": "Regner"}}, + {"id": 2, "name": "Faye Raker"}, + ] pd.json_normalize(data) .. ipython:: python - data = [{'state': 'Florida', - 'shortname': 'FL', - 'info': {'governor': 'Rick Scott'}, - 'county': [{'name': 'Dade', 'population': 12345}, - {'name': 'Broward', 'population': 40000}, - {'name': 'Palm Beach', 'population': 60000}]}, - {'state': 'Ohio', - 'shortname': 'OH', - 'info': {'governor': 'John Kasich'}, - 'county': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}]}] - - pd.json_normalize(data, 'county', ['state', 'shortname', ['info', 'governor']]) + data = [ + { + "state": "Florida", + "shortname": "FL", + "info": {"governor": "Rick Scott"}, + "county": [ + {"name": "Dade", "population": 12345}, + {"name": "Broward", "population": 40000}, + {"name": "Palm Beach", "population": 60000}, + ], + }, + { + "state": "Ohio", + "shortname": "OH", + "info": {"governor": "John Kasich"}, + "county": [ + {"name": "Summit", "population": 1234}, + {"name": "Cuyahoga", "population": 1337}, + ], + }, + ] + + pd.json_normalize(data, "county", ["state", "shortname", ["info", "governor"]]) The max_level parameter provides more control over which level to end normalization. With max_level=1 the following snippet normalizes until 1st nesting level of the provided dict. .. ipython:: python - data = [{'CreatedBy': {'Name': 'User001'}, - 'Lookup': {'TextField': 'Some text', - 'UserField': {'Id': 'ID001', - 'Name': 'Name001'}}, - 'Image': {'a': 'b'} - }] + data = [ + { + "CreatedBy": {"Name": "User001"}, + "Lookup": { + "TextField": "Some text", + "UserField": {"Id": "ID001", "Name": "Name001"}, + }, + "Image": {"a": "b"}, + } + ] pd.json_normalize(data, max_level=1) .. _io.jsonl: @@ -2232,13 +2213,13 @@ For line-delimited json files, pandas can also return an iterator which reads in .. 
ipython:: python - jsonl = ''' + jsonl = """ {"a": 1, "b": 2} {"a": 3, "b": 4} - ''' + """ df = pd.read_json(jsonl, lines=True) df - df.to_json(orient='records', lines=True) + df.to_json(orient="records", lines=True) # reader is an iterator that returns ``chunksize`` lines each iteration reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) @@ -2258,12 +2239,16 @@ a JSON string with two fields, ``schema`` and ``data``. .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, - index=pd.Index(range(3), name='idx')) + df = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["a", "b", "c"], + "C": pd.date_range("2016-01-01", freq="d", periods=3), + }, + index=pd.Index(range(3), name="idx"), + ) df - df.to_json(orient='table', date_format="iso") + df.to_json(orient="table", date_format="iso") The ``schema`` field contains the ``fields`` key, which itself contains a list of column name to type pairs, including the ``Index`` or ``MultiIndex`` @@ -2302,7 +2287,8 @@ A few notes on the generated table schema: .. ipython:: python from pandas.io.json import build_table_schema - s = pd.Series(pd.date_range('2016', periods=4)) + + s = pd.Series(pd.date_range("2016", periods=4)) build_table_schema(s) * datetimes with a timezone (before serializing), include an additional field @@ -2310,8 +2296,7 @@ A few notes on the generated table schema: .. ipython:: python - s_tz = pd.Series(pd.date_range('2016', periods=12, - tz='US/Central')) + s_tz = pd.Series(pd.date_range("2016", periods=12, tz="US/Central")) build_table_schema(s_tz) * Periods are converted to timestamps before serialization, and so have the @@ -2320,8 +2305,7 @@ A few notes on the generated table schema: .. ipython:: python - s_per = pd.Series(1, index=pd.period_range('2016', freq='A-DEC', - periods=4)) + s_per = pd.Series(1, index=pd.period_range("2016", freq="A-DEC", periods=4)) build_table_schema(s_per) * Categoricals use the ``any`` type and an ``enum`` constraint listing @@ -2329,7 +2313,7 @@ A few notes on the generated table schema: .. ipython:: python - s_cat = pd.Series(pd.Categorical(['a', 'b', 'a'])) + s_cat = pd.Series(pd.Categorical(["a", "b", "a"])) build_table_schema(s_cat) * A ``primaryKey`` field, containing an array of labels, is included @@ -2345,8 +2329,7 @@ A few notes on the generated table schema: .. ipython:: python - s_multi = pd.Series(1, index=pd.MultiIndex.from_product([('a', 'b'), - (0, 1)])) + s_multi = pd.Series(1, index=pd.MultiIndex.from_product([("a", "b"), (0, 1)])) build_table_schema(s_multi) * The default naming roughly follows these rules: @@ -2366,16 +2349,20 @@ round-trippable manner. .. ipython:: python - df = pd.DataFrame({'foo': [1, 2, 3, 4], - 'bar': ['a', 'b', 'c', 'd'], - 'baz': pd.date_range('2018-01-01', freq='d', periods=4), - 'qux': pd.Categorical(['a', 'b', 'c', 'c']) - }, index=pd.Index(range(4), name='idx')) + df = pd.DataFrame( + { + "foo": [1, 2, 3, 4], + "bar": ["a", "b", "c", "d"], + "baz": pd.date_range("2018-01-01", freq="d", periods=4), + "qux": pd.Categorical(["a", "b", "c", "c"]), + }, + index=pd.Index(range(4), name="idx"), + ) df df.dtypes - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') + df.to_json("test.json", orient="table") + new_df = pd.read_json("test.json", orient="table") new_df new_df.dtypes @@ -2387,15 +2374,15 @@ indicate missing values and the subsequent read cannot distinguish the intent. .. 
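For contrast, a short sketch (in-memory round trip, reusing the ``qux`` categorical column from the example above) showing that the default ``orient`` does not bring the extension dtypes back, while ``orient='table'`` does via the embedded schema:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"qux": pd.Categorical(["a", "b", "c", "c"])})

    # default orient: the categorical dtype comes back as plain object
    pd.read_json(df.to_json()).dtypes

    # orient="table": the schema restores the categorical dtype
    pd.read_json(df.to_json(orient="table"), orient="table").dtypes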
ipython:: python :okwarning: - df.index.name = 'index' - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') + df.index.name = "index" + df.to_json("test.json", orient="table") + new_df = pd.read_json("test.json", orient="table") print(new_df.index.name) .. ipython:: python :suppress: - os.remove('test.json') + os.remove("test.json") .. _Table Schema: https://specs.frictionlessdata.io/table-schema/ @@ -2425,7 +2412,7 @@ Read a URL with no options: .. ipython:: python - url = 'https://www.fdic.gov/bank/individual/failed/banklist.html' + url = "https://www.fdic.gov/bank/individual/failed/banklist.html" dfs = pd.read_html(url) dfs @@ -2440,11 +2427,11 @@ as a string: .. ipython:: python :suppress: - file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) + file_path = os.path.abspath(os.path.join("source", "_static", "banklist.html")) .. ipython:: python - with open(file_path, 'r') as f: + with open(file_path, "r") as f: dfs = pd.read_html(f.read()) dfs @@ -2452,7 +2439,7 @@ You can even pass in an instance of ``StringIO`` if you so desire: .. ipython:: python - with open(file_path, 'r') as f: + with open(file_path, "r") as f: sio = StringIO(f.read()) dfs = pd.read_html(sio) @@ -2471,7 +2458,7 @@ Read a URL and match a table that contains specific text: .. code-block:: python - match = 'Metcalf Bank' + match = "Metcalf Bank" df_list = pd.read_html(url, match=match) Specify a header row (by default ```` or ```` elements located within a @@ -2506,15 +2493,15 @@ Specify an HTML attribute: .. code-block:: python - dfs1 = pd.read_html(url, attrs={'id': 'table'}) - dfs2 = pd.read_html(url, attrs={'class': 'sortable'}) + dfs1 = pd.read_html(url, attrs={"id": "table"}) + dfs2 = pd.read_html(url, attrs={"class": "sortable"}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True Specify values that should be converted to NaN: .. code-block:: python - dfs = pd.read_html(url, na_values=['No Acquirer']) + dfs = pd.read_html(url, na_values=["No Acquirer"]) Specify whether to keep the default set of NaN values: @@ -2529,22 +2516,21 @@ columns to strings. .. code-block:: python - url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' - dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0, - converters={'MNC': str}) + url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code" + dfs = pd.read_html(url_mcc, match="Telekom Albania", header=0, converters={"MNC": str}) Use some combination of the above: .. code-block:: python - dfs = pd.read_html(url, match='Metcalf Bank', index_col=0) + dfs = pd.read_html(url, match="Metcalf Bank", index_col=0) Read in pandas ``to_html`` output (with some loss of floating point precision): .. code-block:: python df = pd.DataFrame(np.random.randn(2, 2)) - s = df.to_html(float_format='{0:.40g}'.format) + s = df.to_html(float_format="{0:.40g}".format) dfin = pd.read_html(s, index_col=0) The ``lxml`` backend will raise an error on a failed parse if that is the only @@ -2554,13 +2540,13 @@ for example, the function expects a sequence of strings. You may use: .. code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml"]) Or you could pass ``flavor='lxml'`` without a list: .. 
code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor="lxml") However, if you have bs4 and html5lib installed and pass ``None`` or ``['lxml', 'bs4']`` then the parse will most likely succeed. Note that *as soon as a parse @@ -2568,7 +2554,7 @@ succeeds, the function will return*. .. code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) .. _io.html: @@ -2590,8 +2576,8 @@ in the method ``to_string`` described above. :suppress: def write_html(df, filename, *args, **kwargs): - static = os.path.abspath(os.path.join('source', '_static')) - with open(os.path.join(static, filename + '.html'), 'w') as f: + static = os.path.abspath(os.path.join("source", "_static")) + with open(os.path.join(static, filename + ".html"), "w") as f: df.to_html(f, *args, **kwargs) .. ipython:: python @@ -2603,7 +2589,7 @@ in the method ``to_string`` described above. .. ipython:: python :suppress: - write_html(df, 'basic') + write_html(df, "basic") HTML: @@ -2619,7 +2605,7 @@ The ``columns`` argument will limit the columns shown: .. ipython:: python :suppress: - write_html(df, 'columns', columns=[0]) + write_html(df, "columns", columns=[0]) HTML: @@ -2631,12 +2617,12 @@ point values: .. ipython:: python - print(df.to_html(float_format='{0:.10f}'.format)) + print(df.to_html(float_format="{0:.10f}".format)) .. ipython:: python :suppress: - write_html(df, 'float_format', float_format='{0:.10f}'.format) + write_html(df, "float_format", float_format="{0:.10f}".format) HTML: @@ -2653,7 +2639,7 @@ off: .. ipython:: python :suppress: - write_html(df, 'nobold', bold_rows=False) + write_html(df, "nobold", bold_rows=False) .. raw:: html :file: ../_static/nobold.html @@ -2664,7 +2650,7 @@ table CSS classes. Note that these classes are *appended* to the existing .. ipython:: python - print(df.to_html(classes=['awesome_table_class', 'even_more_awesome_class'])) + print(df.to_html(classes=["awesome_table_class", "even_more_awesome_class"])) The ``render_links`` argument provides the ability to add hyperlinks to cells that contain URLs. @@ -2673,15 +2659,18 @@ that contain URLs. .. ipython:: python - url_df = pd.DataFrame({ - 'name': ['Python', 'Pandas'], - 'url': ['https://www.python.org/', 'https://pandas.pydata.org']}) + url_df = pd.DataFrame( + { + "name": ["Python", "Pandas"], + "url": ["https://www.python.org/", "https://pandas.pydata.org"], + } + ) print(url_df.to_html(render_links=True)) .. ipython:: python :suppress: - write_html(url_df, 'render_links', render_links=True) + write_html(url_df, "render_links", render_links=True) HTML: @@ -2694,14 +2683,14 @@ Finally, the ``escape`` argument allows you to control whether the .. ipython:: python - df = pd.DataFrame({'a': list('&<>'), 'b': np.random.randn(3)}) + df = pd.DataFrame({"a": list("&<>"), "b": np.random.randn(3)}) .. ipython:: python :suppress: - write_html(df, 'escape') - write_html(df, 'noescape', escape=False) + write_html(df, "escape") + write_html(df, "noescape", escape=False) Escaped: @@ -2828,7 +2817,7 @@ file, and the ``sheet_name`` indicating which sheet to parse. .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', sheet_name='Sheet1') + pd.read_excel("path_to_file.xls", sheet_name="Sheet1") .. _io.excel.excelfile_class: @@ -2843,16 +2832,16 @@ read into memory only once. .. 
code-block:: python - xlsx = pd.ExcelFile('path_to_file.xls') - df = pd.read_excel(xlsx, 'Sheet1') + xlsx = pd.ExcelFile("path_to_file.xls") + df = pd.read_excel(xlsx, "Sheet1") The ``ExcelFile`` class can also be used as a context manager. .. code-block:: python - with pd.ExcelFile('path_to_file.xls') as xls: - df1 = pd.read_excel(xls, 'Sheet1') - df2 = pd.read_excel(xls, 'Sheet2') + with pd.ExcelFile("path_to_file.xls") as xls: + df1 = pd.read_excel(xls, "Sheet1") + df2 = pd.read_excel(xls, "Sheet2") The ``sheet_names`` property will generate a list of the sheet names in the file. @@ -2864,10 +2853,9 @@ different parameters: data = {} # For when Sheet1's format differs from Sheet2 - with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, - na_values=['NA']) - data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=1) + with pd.ExcelFile("path_to_file.xls") as xls: + data["Sheet1"] = pd.read_excel(xls, "Sheet1", index_col=None, na_values=["NA"]) + data["Sheet2"] = pd.read_excel(xls, "Sheet2", index_col=1) Note that if the same parsing parameters are used for all sheets, a list of sheet names can simply be passed to ``read_excel`` with no loss in performance. @@ -2876,15 +2864,14 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc # using the ExcelFile class data = {} - with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, - na_values=['NA']) - data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=None, - na_values=['NA']) + with pd.ExcelFile("path_to_file.xls") as xls: + data["Sheet1"] = pd.read_excel(xls, "Sheet1", index_col=None, na_values=["NA"]) + data["Sheet2"] = pd.read_excel(xls, "Sheet2", index_col=None, na_values=["NA"]) # equivalent using the read_excel function - data = pd.read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], - index_col=None, na_values=['NA']) + data = pd.read_excel( + "path_to_file.xls", ["Sheet1", "Sheet2"], index_col=None, na_values=["NA"] + ) ``ExcelFile`` can also be called with a ``xlrd.book.Book`` object as a parameter. This allows the user to control how the excel file is read. @@ -2894,10 +2881,11 @@ with ``on_demand=True``. .. code-block:: python import xlrd - xlrd_book = xlrd.open_workbook('path_to_file.xls', on_demand=True) + + xlrd_book = xlrd.open_workbook("path_to_file.xls", on_demand=True) with pd.ExcelFile(xlrd_book) as xls: - df1 = pd.read_excel(xls, 'Sheet1') - df2 = pd.read_excel(xls, 'Sheet2') + df1 = pd.read_excel(xls, "Sheet1") + df2 = pd.read_excel(xls, "Sheet2") .. _io.excel.specifying_sheets: @@ -2919,35 +2907,35 @@ Specifying sheets .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + pd.read_excel("path_to_file.xls", "Sheet1", index_col=None, na_values=["NA"]) Using the sheet index: .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + pd.read_excel("path_to_file.xls", 0, index_col=None, na_values=["NA"]) Using all default values: .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls') + pd.read_excel("path_to_file.xls") Using None to get all sheets: .. code-block:: python # Returns a dictionary of DataFrames - pd.read_excel('path_to_file.xls', sheet_name=None) + pd.read_excel("path_to_file.xls", sheet_name=None) Using a list to get multiple sheets: .. 
code-block:: python # Returns the 1st and 4th sheet, as a dictionary of DataFrames. - pd.read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) + pd.read_excel("path_to_file.xls", sheet_name=["Sheet1", 3]) ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. @@ -2968,10 +2956,12 @@ For example, to read in a ``MultiIndex`` index without names: .. ipython:: python - df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, - index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']])) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) + df = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}, + index=pd.MultiIndex.from_product([["a", "b"], ["c", "d"]]), + ) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df If the index has level names, they will parsed as well, using the same @@ -2979,9 +2969,9 @@ parameters. .. ipython:: python - df.index = df.index.set_names(['lvl1', 'lvl2']) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) + df.index = df.index.set_names(["lvl1", "lvl2"]) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df @@ -2990,16 +2980,15 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python - df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], - names=['c1', 'c2']) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1]) + df.columns = pd.MultiIndex.from_product([["a"], ["b", "d"]], names=["c1", "c2"]) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], header=[0, 1]) df .. ipython:: python :suppress: - os.remove('path_to_file.xlsx') + os.remove("path_to_file.xlsx") Parsing specific columns @@ -3018,14 +3007,14 @@ You can specify a comma-delimited set of Excel columns and ranges as a string: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E') + pd.read_excel("path_to_file.xls", "Sheet1", usecols="A,C:E") If ``usecols`` is a list of integers, then it is assumed to be the file column indices to be parsed. .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=[0, 2, 3]) Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. @@ -3037,7 +3026,7 @@ document header row(s). Those strings define which columns will be parsed: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=["foo", "bar"]) Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``. @@ -3048,7 +3037,7 @@ the column names, returning names where the callable function evaluates to ``Tru .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=lambda x: x.isalpha()) Parsing dates +++++++++++++ @@ -3060,7 +3049,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) + pd.read_excel("path_to_file.xls", "Sheet1", parse_dates=["date_strings"]) Cell converters @@ -3071,7 +3060,7 @@ option. For instance, to convert a column to boolean: .. 
code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + pd.read_excel("path_to_file.xls", "Sheet1", converters={"MyBools": bool}) This options handles missing values and treats exceptions in the converters as missing data. Transformations are applied cell by cell rather than to the @@ -3086,7 +3075,7 @@ missing data to recover integer dtype: return int(x) if x else -1 - pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + pd.read_excel("path_to_file.xls", "Sheet1", converters={"MyInts": cfun}) Dtype specifications ++++++++++++++++++++ @@ -3098,7 +3087,7 @@ no type inference, use the type ``str`` or ``object``. .. code-block:: python - pd.read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) + pd.read_excel("path_to_file.xls", dtype={"MyInts": "int64", "MyText": str}) .. _io.excel_writer: @@ -3116,7 +3105,7 @@ written. For example: .. code-block:: python - df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1") Files with a ``.xls`` extension will be written using ``xlwt`` and those with a ``.xlsx`` extension will be written using ``xlsxwriter`` (if available) or @@ -3129,16 +3118,16 @@ row instead of the first. You can place it in the first row by setting the .. code-block:: python - df.to_excel('path_to_file.xlsx', index_label='label', merge_cells=False) + df.to_excel("path_to_file.xlsx", index_label="label", merge_cells=False) In order to write separate ``DataFrames`` to separate sheets in a single Excel file, one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python - with pd.ExcelWriter('path_to_file.xlsx') as writer: - df1.to_excel(writer, sheet_name='Sheet1') - df2.to_excel(writer, sheet_name='Sheet2') + with pd.ExcelWriter("path_to_file.xlsx") as writer: + df1.to_excel(writer, sheet_name="Sheet1") + df2.to_excel(writer, sheet_name="Sheet2") .. note:: @@ -3164,8 +3153,8 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` bio = BytesIO() # By setting the 'engine' in the ExcelWriter constructor. - writer = pd.ExcelWriter(bio, engine='xlsxwriter') - df.to_excel(writer, sheet_name='Sheet1') + writer = pd.ExcelWriter(bio, engine="xlsxwriter") + df.to_excel(writer, sheet_name="Sheet1") # Save the workbook writer.save() @@ -3214,16 +3203,17 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: .. code-block:: python # By setting the 'engine' in the DataFrame 'to_excel()' methods. - df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1", engine="xlsxwriter") # By setting the 'engine' in the ExcelWriter constructor. - writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + writer = pd.ExcelWriter("path_to_file.xlsx", engine="xlsxwriter") # Or via pandas configuration. from pandas import options # noqa: E402 - options.io.excel.xlsx.writer = 'xlsxwriter' - df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') + options.io.excel.xlsx.writer = "xlsxwriter" + + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1") .. _io.excel.style: @@ -3254,7 +3244,7 @@ OpenDocument spreadsheets match what can be done for `Excel files`_ using .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.ods', engine='odf') + pd.read_excel("path_to_file.ods", engine="odf") .. note:: @@ -3277,7 +3267,7 @@ in files and will return floats instead. .. 
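Since the floats that ``pyxlsb`` returns for date cells are raw Excel serial numbers, one possible post-processing step is sketched below (the ``date`` column name is hypothetical, and Excel's 1900 date system is assumed):

.. code-block:: python

    import pandas as pd

    df = pd.read_excel("path_to_file.xlsb", engine="pyxlsb")

    # Excel's 1900 date system counts days from 1899-12-30
    # (valid for serial numbers from March 1900 onwards)
    df["date"] = pd.to_datetime(df["date"], unit="D", origin="1899-12-30")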
code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xlsb', engine='pyxlsb') + pd.read_excel("path_to_file.xlsb", engine="pyxlsb") .. note:: @@ -3353,7 +3343,7 @@ All pandas objects are equipped with ``to_pickle`` methods which use Python's .. ipython:: python df - df.to_pickle('foo.pkl') + df.to_pickle("foo.pkl") The ``read_pickle`` function in the ``pandas`` namespace can be used to load any pickled pandas object (or any other pickled object) from file: @@ -3361,12 +3351,12 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python - pd.read_pickle('foo.pkl') + pd.read_pickle("foo.pkl") .. ipython:: python :suppress: - os.remove('foo.pkl') + os.remove("foo.pkl") .. warning:: @@ -3400,10 +3390,13 @@ the underlying compression library. .. ipython:: python - df = pd.DataFrame({ - 'A': np.random.randn(1000), - 'B': 'foo', - 'C': pd.date_range('20130101', periods=1000, freq='s')}) + df = pd.DataFrame( + { + "A": np.random.randn(1000), + "B": "foo", + "C": pd.date_range("20130101", periods=1000, freq="s"), + } + ) df Using an explicit compression type: @@ -3438,10 +3431,7 @@ Passing options to the compression protocol in order to speed up compression: .. ipython:: python - df.to_pickle( - "data.pkl.gz", - compression={"method": "gzip", 'compresslevel': 1} - ) + df.to_pickle("data.pkl.gz", compression={"method": "gzip", "compresslevel": 1}) .. ipython:: python :suppress: @@ -3462,11 +3452,13 @@ Example pyarrow usage: .. code-block:: python - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame({'A': [1, 2, 3]}) - >>> context = pa.default_serialization_context() - >>> df_bytestring = context.serialize(df).to_buffer().to_pybytes() + import pandas as pd + import pyarrow as pa + + df = pd.DataFrame({"A": [1, 2, 3]}) + + context = pa.default_serialization_context() + df_bytestring = context.serialize(df).to_buffer().to_pybytes() For documentation on pyarrow, see `here `__. @@ -3492,11 +3484,11 @@ for some advanced strategies :suppress: :okexcept: - os.remove('store.h5') + os.remove("store.h5") .. ipython:: python - store = pd.HDFStore('store.h5') + store = pd.HDFStore("store.h5") print(store) Objects can be written to the file just like adding key-value pairs to a @@ -3504,15 +3496,14 @@ dict: .. ipython:: python - index = pd.date_range('1/1/2000', periods=8) - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = pd.DataFrame(np.random.randn(8, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.date_range("1/1/2000", periods=8) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) + df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) # store.put('s', s) is an equivalent method - store['s'] = s + store["s"] = s - store['df'] = df + store["df"] = df store @@ -3521,7 +3512,7 @@ In a current or later Python session, you can retrieve stored objects: .. ipython:: python # store.get('df') is an equivalent method - store['df'] + store["df"] # dotted (attribute) access provides get as well store.df @@ -3531,7 +3522,7 @@ Deletion of the object specified by the key: .. ipython:: python # store.remove('df') is an equivalent method - del store['df'] + del store["df"] store @@ -3544,14 +3535,14 @@ Closing a Store and using a context manager: store.is_open # Working with, and automatically closing the store using a context manager - with pd.HDFStore('store.h5') as store: + with pd.HDFStore("store.h5") as store: store.keys() .. 
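As an additional, hypothetical illustration (not executed in these docs), the
same store can be reopened read-only; in ``mode="r"`` the file cannot be
modified and it is closed automatically when the block exits:

.. code-block:: python

   with pd.HDFStore("store.h5", mode="r") as ro_store:
       s_roundtrip = ro_store["s"]

..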
ipython:: python :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") @@ -3563,15 +3554,15 @@ similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python - df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))}) - df_tl.to_hdf('store_tl.h5', 'table', append=True) - pd.read_hdf('store_tl.h5', 'table', where=['index>2']) + df_tl = pd.DataFrame({"A": list(range(5)), "B": list(range(5))}) + df_tl.to_hdf("store_tl.h5", "table", append=True) + pd.read_hdf("store_tl.h5", "table", where=["index>2"]) .. ipython:: python :suppress: :okexcept: - os.remove('store_tl.h5') + os.remove("store_tl.h5") HDFStore will by default not drop rows that are all missing. This behavior can be changed by setting ``dropna=True``. @@ -3579,24 +3570,23 @@ HDFStore will by default not drop rows that are all missing. This behavior can b .. ipython:: python - df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2], - 'col2': [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) df_with_missing - df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w') + df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") - pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf("file.h5", "df_with_missing") - df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w', dropna=True) - pd.read_hdf('file.h5', 'df_with_missing') + df_with_missing.to_hdf( + "file.h5", "df_with_missing", format="table", mode="w", dropna=True + ) + pd.read_hdf("file.h5", "df_with_missing") .. ipython:: python :suppress: - os.remove('file.h5') + os.remove("file.h5") .. _io.hdf5-fixed: @@ -3642,21 +3632,21 @@ enable ``put/append/to_hdf`` to by default store in the ``table`` format. :suppress: :okexcept: - os.remove('store.h5') + os.remove("store.h5") .. ipython:: python - store = pd.HDFStore('store.h5') + store = pd.HDFStore("store.h5") df1 = df[0:4] df2 = df[4:] # append data (creates a table automatically) - store.append('df', df1) - store.append('df', df2) + store.append("df", df1) + store.append("df", df2) store # select the entire object - store.select('df') + store.select("df") # the type of stored data store.root.df._v_attrs.pandas_type @@ -3679,16 +3669,16 @@ everything in the sub-store and **below**, so be *careful*. .. ipython:: python - store.put('foo/bar/bah', df) - store.append('food/orange', df) - store.append('food/apple', df) + store.put("foo/bar/bah", df) + store.append("food/orange", df) + store.append("food/apple", df) store # a list of keys are returned store.keys() # remove all nodes under this level - store.remove('food') + store.remove("food") store @@ -3702,10 +3692,10 @@ will yield a tuple for each group key along with the relative keys of its conten for (path, subgroups, subkeys) in store.walk(): for subgroup in subgroups: - print('GROUP: {}/{}'.format(path, subgroup)) + print("GROUP: {}/{}".format(path, subgroup)) for subkey in subkeys: - key = '/'.join([path, subkey]) - print('KEY: {}'.format(key)) + key = "/".join([path, subkey]) + print("KEY: {}".format(key)) print(store.get(key)) @@ -3729,7 +3719,7 @@ will yield a tuple for each group key along with the relative keys of its conten .. ipython:: python - store['foo/bar/bah'] + store["foo/bar/bah"] .. _io.hdf5-types: @@ -3753,19 +3743,22 @@ defaults to ``nan``. .. 
ipython:: python - df_mixed = pd.DataFrame({'A': np.random.randn(8), - 'B': np.random.randn(8), - 'C': np.array(np.random.randn(8), dtype='float32'), - 'string': 'string', - 'int': 1, - 'bool': True, - 'datetime64': pd.Timestamp('20010102')}, - index=list(range(8))) - df_mixed.loc[df_mixed.index[3:5], - ['A', 'B', 'string', 'datetime64']] = np.nan + df_mixed = pd.DataFrame( + { + "A": np.random.randn(8), + "B": np.random.randn(8), + "C": np.array(np.random.randn(8), dtype="float32"), + "string": "string", + "int": 1, + "bool": True, + "datetime64": pd.Timestamp("20010102"), + }, + index=list(range(8)), + ) + df_mixed.loc[df_mixed.index[3:5], ["A", "B", "string", "datetime64"]] = np.nan - store.append('df_mixed', df_mixed, min_itemsize={'values': 50}) - df_mixed1 = store.select('df_mixed') + store.append("df_mixed", df_mixed, min_itemsize={"values": 50}) + df_mixed1 = store.select("df_mixed") df_mixed1 df_mixed1.dtypes.value_counts() @@ -3780,20 +3773,19 @@ storing/selecting from homogeneous index ``DataFrames``. .. ipython:: python - index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) df_mi - store.append('df_mi', df_mi) - store.select('df_mi') + store.append("df_mi", df_mi) + store.select("df_mi") # the levels are automatically included as data columns - store.select('df_mi', 'foo=bar') + store.select("df_mi", "foo=bar") .. note:: The ``index`` keyword is reserved and cannot be use as a level name. @@ -3870,7 +3862,7 @@ The right-hand side of the sub-expression (after a comparison operator) can be: .. code-block:: python string = "HolyMoly'" - store.select('df', 'index == string') + store.select("df", "index == string") instead of this @@ -3887,7 +3879,7 @@ The right-hand side of the sub-expression (after a comparison operator) can be: .. code-block:: python - store.select('df', 'index == %r' % string) + store.select("df", "index == %r" % string) which will quote ``string``. @@ -3896,21 +3888,24 @@ Here are some examples: .. ipython:: python - dfq = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'), - index=pd.date_range('20130101', periods=10)) - store.append('dfq', dfq, format='table', data_columns=True) + dfq = pd.DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=pd.date_range("20130101", periods=10), + ) + store.append("dfq", dfq, format="table", data_columns=True) Use boolean expressions, with in-line function evaluation. .. ipython:: python - store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']") + store.select("dfq", "index>pd.Timestamp('20130104') & columns=['A', 'B']") Use inline column reference. .. ipython:: python - store.select('dfq', where="A>0 or C>0") + store.select("dfq", where="A>0 or C>0") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3918,7 +3913,7 @@ returned, this is equivalent to passing a .. 
ipython:: python - store.select('df', "columns=['A', 'B']") + store.select("df", "columns=['A', 'B']") ``start`` and ``stop`` parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. @@ -3944,14 +3939,19 @@ specified in the format: ``()``, where float may be signed (and fra .. ipython:: python from datetime import timedelta - dftd = pd.DataFrame({'A': pd.Timestamp('20130101'), - 'B': [pd.Timestamp('20130101') + timedelta(days=i, - seconds=10) - for i in range(10)]}) - dftd['C'] = dftd['A'] - dftd['B'] + + dftd = pd.DataFrame( + { + "A": pd.Timestamp("20130101"), + "B": [ + pd.Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) + ], + } + ) + dftd["C"] = dftd["A"] - dftd["B"] dftd - store.append('dftd', dftd, data_columns=True) - store.select('dftd', "C<'-3.5D'") + store.append("dftd", dftd, data_columns=True) + store.select("dftd", "C<'-3.5D'") .. _io.query_multi: @@ -3963,7 +3963,7 @@ Selecting from a ``MultiIndex`` can be achieved by using the name of the level. .. ipython:: python df_mi.index.names - store.select('df_mi', "foo=baz and bar=two") + store.select("df_mi", "foo=baz and bar=two") If the ``MultiIndex`` levels names are ``None``, the levels are automatically made available via the ``level_n`` keyword with ``n`` the level of the ``MultiIndex`` you want to select from. @@ -3974,8 +3974,7 @@ the ``level_n`` keyword with ``n`` the level of the ``MultiIndex`` you want to s levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], ) - df_mi_2 = pd.DataFrame(np.random.randn(10, 3), - index=index, columns=["A", "B", "C"]) + df_mi_2 = pd.DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) df_mi_2 store.append("df_mi_2", df_mi_2) @@ -4006,7 +4005,7 @@ indexed dimension as the ``where``. i.optlevel, i.kind # change an index by passing new parameters - store.create_table_index('df', optlevel=9, kind='full') + store.create_table_index("df", optlevel=9, kind="full") i = store.root.df.table.cols.index.index i.optlevel, i.kind @@ -4014,20 +4013,20 @@ Oftentimes when appending large amounts of data to a store, it is useful to turn .. ipython:: python - df_1 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) - df_2 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) + df_1 = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) + df_2 = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) - st = pd.HDFStore('appends.h5', mode='w') - st.append('df', df_1, data_columns=['B'], index=False) - st.append('df', df_2, data_columns=['B'], index=False) - st.get_storer('df').table + st = pd.HDFStore("appends.h5", mode="w") + st.append("df", df_1, data_columns=["B"], index=False) + st.append("df", df_2, data_columns=["B"], index=False) + st.get_storer("df").table Then create the index when finished appending. .. ipython:: python - st.create_table_index('df', columns=['B'], optlevel=9, kind='full') - st.get_storer('df').table + st.create_table_index("df", columns=["B"], optlevel=9, kind="full") + st.get_storer("df").table st.close() @@ -4035,7 +4034,7 @@ Then create the index when finished appending. :suppress: :okexcept: - os.remove('appends.h5') + os.remove("appends.h5") See `here `__ for how to create a completely-sorted-index (CSI) on an existing store. @@ -4054,22 +4053,22 @@ be ``data_columns``. .. 
ipython:: python df_dc = df.copy() - df_dc['string'] = 'foo' - df_dc.loc[df_dc.index[4:6], 'string'] = np.nan - df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' - df_dc['string2'] = 'cool' - df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0 + df_dc["string"] = "foo" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" + df_dc["string2"] = "cool" + df_dc.loc[df_dc.index[1:3], ["B", "C"]] = 1.0 df_dc # on-disk operations - store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2']) - store.select('df_dc', where='B > 0') + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) + store.select("df_dc", where="B > 0") # getting creative - store.select('df_dc', 'B > 0 & C > 0 & string == foo') + store.select("df_dc", "B > 0 & C > 0 & string == foo") # this is in-memory version of this type of selection - df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] + df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] # we have automagically created this index and the B/C/string/string2 # columns are stored separately as ``PyTables`` columns @@ -4090,7 +4089,7 @@ The default is 50,000 rows returned in a chunk. .. ipython:: python - for df in store.select('df', chunksize=3): + for df in store.select("df", chunksize=3): print(df) .. note:: @@ -4100,7 +4099,7 @@ The default is 50,000 rows returned in a chunk. .. code-block:: python - for df in pd.read_hdf('store.h5', 'df', chunksize=3): + for df in pd.read_hdf("store.h5", "df", chunksize=3): print(df) Note, that the chunksize keyword applies to the **source** rows. So if you @@ -4112,18 +4111,20 @@ chunks. .. ipython:: python - dfeq = pd.DataFrame({'number': np.arange(1, 11)}) + dfeq = pd.DataFrame({"number": np.arange(1, 11)}) dfeq - store.append('dfeq', dfeq, data_columns=['number']) + store.append("dfeq", dfeq, data_columns=["number"]) + def chunks(l, n): - return [l[i:i + n] for i in range(0, len(l), n)] + return [l[i: i + n] for i in range(0, len(l), n)] + evens = [2, 4, 6, 8, 10] - coordinates = store.select_as_coordinates('dfeq', 'number=evens') + coordinates = store.select_as_coordinates("dfeq", "number=evens") for c in chunks(coordinates, 2): - print(store.select('dfeq', where=c)) + print(store.select("dfeq", where=c)) Advanced queries ++++++++++++++++ @@ -4138,8 +4139,8 @@ These do not currently accept the ``where`` selector. .. ipython:: python - store.select_column('df_dc', 'index') - store.select_column('df_dc', 'string') + store.select_column("df_dc", "index") + store.select_column("df_dc", "string") .. _io.hdf5-selecting_coordinates: @@ -4152,12 +4153,13 @@ Sometimes you want to get the coordinates (a.k.a the index locations) of your qu .. ipython:: python - df_coord = pd.DataFrame(np.random.randn(1000, 2), - index=pd.date_range('20000101', periods=1000)) - store.append('df_coord', df_coord) - c = store.select_as_coordinates('df_coord', 'index > 20020101') + df_coord = pd.DataFrame( + np.random.randn(1000, 2), index=pd.date_range("20000101", periods=1000) + ) + store.append("df_coord", df_coord) + c = store.select_as_coordinates("df_coord", "index > 20020101") c - store.select('df_coord', where=c) + store.select("df_coord", where=c) .. _io.hdf5-where_mask: @@ -4170,12 +4172,13 @@ a datetimeindex which are 5. .. 
ipython:: python - df_mask = pd.DataFrame(np.random.randn(1000, 2), - index=pd.date_range('20000101', periods=1000)) - store.append('df_mask', df_mask) - c = store.select_column('df_mask', 'index') + df_mask = pd.DataFrame( + np.random.randn(1000, 2), index=pd.date_range("20000101", periods=1000) + ) + store.append("df_mask", df_mask) + c = store.select_column("df_mask", "index") where = c[pd.DatetimeIndex(c).month == 5].index - store.select('df_mask', where=where) + store.select("df_mask", where=where) Storer object ^^^^^^^^^^^^^ @@ -4186,7 +4189,7 @@ of rows in an object. .. ipython:: python - store.get_storer('df_dc').nrows + store.get_storer("df_dc").nrows Multiple table queries @@ -4219,24 +4222,26 @@ results. .. ipython:: python - df_mt = pd.DataFrame(np.random.randn(8, 6), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) - df_mt['foo'] = 'bar' - df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan + df_mt = pd.DataFrame( + np.random.randn(8, 6), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C", "D", "E", "F"], + ) + df_mt["foo"] = "bar" + df_mt.loc[df_mt.index[1], ("A", "B")] = np.nan # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, - df_mt, selector='df1_mt') + store.append_to_multiple( + {"df1_mt": ["A", "B"], "df2_mt": None}, df_mt, selector="df1_mt" + ) store # individual tables were created - store.select('df1_mt') - store.select('df2_mt') + store.select("df1_mt") + store.select("df2_mt") # as a multiple - store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector='df1_mt') + store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt") Delete from a table @@ -4345,14 +4350,15 @@ Enable compression for all objects within the file: .. code-block:: python - store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, - complib='blosc:blosclz') + store_compressed = pd.HDFStore( + "store_compressed.h5", complevel=9, complib="blosc:blosclz" + ) Or on-the-fly compression (this only applies to tables) in stores where compression is not enabled: .. code-block:: python - store.append('df', df, complib='zlib', complevel=5) + store.append("df", df, complib="zlib", complevel=5) .. _io.hdf5-ptrepack: @@ -4441,13 +4447,14 @@ stored in a more efficient manner. .. ipython:: python - dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'), - 'B': np.random.randn(8)}) + dfcat = pd.DataFrame( + {"A": pd.Series(list("aabbcdba")).astype("category"), "B": np.random.randn(8)} + ) dfcat dfcat.dtypes - cstore = pd.HDFStore('cats.h5', mode='w') - cstore.append('dfcat', dfcat, format='table', data_columns=['A']) - result = cstore.select('dfcat', where="A in ['b', 'c']") + cstore = pd.HDFStore("cats.h5", mode="w") + cstore.append("dfcat", dfcat, format="table", data_columns=["A"]) + result = cstore.select("dfcat", where="A in ['b', 'c']") result result.dtypes @@ -4456,7 +4463,7 @@ stored in a more efficient manner. :okexcept: cstore.close() - os.remove('cats.h5') + os.remove("cats.h5") String columns @@ -4483,17 +4490,17 @@ Passing a ``min_itemsize`` dict will cause all passed columns to be created as * .. 
ipython:: python - dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5))) + dfs = pd.DataFrame({"A": "foo", "B": "bar"}, index=list(range(5))) dfs # A and B have a size of 30 - store.append('dfs', dfs, min_itemsize=30) - store.get_storer('dfs').table + store.append("dfs", dfs, min_itemsize=30) + store.get_storer("dfs").table # A is created as a data_column with a size of 30 # B is size is calculated - store.append('dfs2', dfs, min_itemsize={'A': 30}) - store.get_storer('dfs2').table + store.append("dfs2", dfs, min_itemsize={"A": 30}) + store.get_storer("dfs2").table **nan_rep** @@ -4502,15 +4509,15 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. ipython:: python - dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']}) + dfss = pd.DataFrame({"A": ["foo", "bar", "nan"]}) dfss - store.append('dfss', dfss) - store.select('dfss') + store.append("dfss", dfss) + store.select("dfss") # here you need to specify a different nan rep - store.append('dfss2', dfss, nan_rep='_nan_') - store.select('dfss2') + store.append("dfss2", dfss, nan_rep="_nan_") + store.select("dfss2") .. _io.external_compatibility: @@ -4529,21 +4536,25 @@ It is possible to write an ``HDFStore`` object that can easily be imported into .. ipython:: python - df_for_r = pd.DataFrame({"first": np.random.rand(100), - "second": np.random.rand(100), - "class": np.random.randint(0, 2, (100, ))}, - index=range(100)) + df_for_r = pd.DataFrame( + { + "first": np.random.rand(100), + "second": np.random.rand(100), + "class": np.random.randint(0, 2, (100,)), + }, + index=range(100), + ) df_for_r.head() - store_export = pd.HDFStore('export.h5') - store_export.append('df_for_r', df_for_r, data_columns=df_dc.columns) + store_export = pd.HDFStore("export.h5") + store_export.append("df_for_r", df_for_r, data_columns=df_dc.columns) store_export .. ipython:: python :suppress: store_export.close() - os.remove('export.h5') + os.remove("export.h5") In R this file can be read into a ``data.frame`` object using the ``rhdf5`` library. The following example function reads the corresponding column names @@ -4630,7 +4641,7 @@ Performance :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") .. _io.feather: @@ -4660,21 +4671,26 @@ See the `Full Documentation `__. :suppress: import warnings + # This can be removed once building with pyarrow >=0.15.0 warnings.filterwarnings("ignore", "The Sparse", FutureWarning) .. ipython:: python - df = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.Categorical(list('abc')), - 'g': pd.date_range('20130101', periods=3), - 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'i': pd.date_range('20130101', periods=3, freq='ns')}) + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, freq="ns"), + } + ) df df.dtypes @@ -4683,13 +4699,13 @@ Write to a feather file. .. ipython:: python - df.to_feather('example.feather') + df.to_feather("example.feather") Read from a feather file. .. 
ipython:: python - result = pd.read_feather('example.feather') + result = pd.read_feather("example.feather") result # we preserve dtypes @@ -4698,7 +4714,7 @@ Read from a feather file. .. ipython:: python :suppress: - os.remove('example.feather') + os.remove("example.feather") .. _io.parquet: @@ -4743,15 +4759,19 @@ See the documentation for `pyarrow `__ an .. ipython:: python - df = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.date_range('20130101', periods=3), - 'g': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'h': pd.Categorical(list('abc')), - 'i': pd.Categorical(list('abc'), ordered=True)}) + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.Categorical(list("abc")), + "i": pd.Categorical(list("abc"), ordered=True), + } + ) df df.dtypes @@ -4761,15 +4781,15 @@ Write to a parquet file. .. ipython:: python :okwarning: - df.to_parquet('example_pa.parquet', engine='pyarrow') - df.to_parquet('example_fp.parquet', engine='fastparquet') + df.to_parquet("example_pa.parquet", engine="pyarrow") + df.to_parquet("example_fp.parquet", engine="fastparquet") Read from a parquet file. .. ipython:: python - result = pd.read_parquet('example_fp.parquet', engine='fastparquet') - result = pd.read_parquet('example_pa.parquet', engine='pyarrow') + result = pd.read_parquet("example_fp.parquet", engine="fastparquet") + result = pd.read_parquet("example_pa.parquet", engine="pyarrow") result.dtypes @@ -4777,18 +4797,16 @@ Read only certain columns of a parquet file. .. ipython:: python - result = pd.read_parquet('example_fp.parquet', - engine='fastparquet', columns=['a', 'b']) - result = pd.read_parquet('example_pa.parquet', - engine='pyarrow', columns=['a', 'b']) + result = pd.read_parquet("example_fp.parquet", engine="fastparquet", columns=["a", "b"]) + result = pd.read_parquet("example_pa.parquet", engine="pyarrow", columns=["a", "b"]) result.dtypes .. ipython:: python :suppress: - os.remove('example_pa.parquet') - os.remove('example_fp.parquet') + os.remove("example_pa.parquet") + os.remove("example_fp.parquet") Handling indexes @@ -4799,8 +4817,8 @@ more columns in the output file. Thus, this code: .. ipython:: python - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) - df.to_parquet('test.parquet', engine='pyarrow') + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df.to_parquet("test.parquet", engine="pyarrow") creates a parquet file with *three* columns if you use ``pyarrow`` for serialization: ``a``, ``b``, and ``__index_level_0__``. If you're using ``fastparquet``, the @@ -4815,7 +4833,7 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to .. ipython:: python - df.to_parquet('test.parquet', index=False) + df.to_parquet("test.parquet", index=False) This creates a parquet file with just the two expected columns, ``a`` and ``b``. If your ``DataFrame`` has a custom index, you won't get it back when you load @@ -4827,7 +4845,7 @@ underlying engine's default behavior. .. ipython:: python :suppress: - os.remove('test.parquet') + os.remove("test.parquet") Partitioning Parquet files @@ -4839,9 +4857,8 @@ Parquet supports partitioning of data based on the values of one or more columns .. 
ipython:: python - df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]}) - df.to_parquet(path='test', engine='pyarrow', - partition_cols=['a'], compression=None) + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}) + df.to_parquet(path="test", engine="pyarrow", partition_cols=["a"], compression=None) The ``path`` specifies the parent directory to which data will be saved. The ``partition_cols`` are the column names by which the dataset will be partitioned. @@ -4863,8 +4880,9 @@ The above example creates a partitioned dataset that may look like: :suppress: from shutil import rmtree + try: - rmtree('test') + rmtree("test") except OSError: pass @@ -4932,15 +4950,16 @@ below and the SQLAlchemy `documentation / # where is relative: - engine = create_engine('sqlite:///foo.db') + engine = create_engine("sqlite:///foo.db") # or absolute, starting with a slash: - engine = create_engine('sqlite:////absolute/path/to/foo.db') + engine = create_engine("sqlite:////absolute/path/to/foo.db") For more information see the examples the SQLAlchemy `documentation `__ @@ -5257,21 +5280,25 @@ Use :func:`sqlalchemy.text` to specify query parameters in a backend-neutral way .. ipython:: python import sqlalchemy as sa - pd.read_sql(sa.text('SELECT * FROM data where Col_1=:col1'), - engine, params={'col1': 'X'}) + + pd.read_sql( + sa.text("SELECT * FROM data where Col_1=:col1"), engine, params={"col1": "X"} + ) If you have an SQLAlchemy description of your database you can express where conditions using SQLAlchemy expressions .. ipython:: python metadata = sa.MetaData() - data_table = sa.Table('data', metadata, - sa.Column('index', sa.Integer), - sa.Column('Date', sa.DateTime), - sa.Column('Col_1', sa.String), - sa.Column('Col_2', sa.Float), - sa.Column('Col_3', sa.Boolean), - ) + data_table = sa.Table( + "data", + metadata, + sa.Column("index", sa.Integer), + sa.Column("Date", sa.DateTime), + sa.Column("Col_1", sa.String), + sa.Column("Col_2", sa.Float), + sa.Column("Col_3", sa.Boolean), + ) pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 is True), engine) @@ -5280,8 +5307,9 @@ You can combine SQLAlchemy expressions with parameters passed to :func:`read_sql .. ipython:: python import datetime as dt - expr = sa.select([data_table]).where(data_table.c.Date > sa.bindparam('date')) - pd.read_sql(expr, engine, params={'date': dt.datetime(2010, 10, 18)}) + + expr = sa.select([data_table]).where(data_table.c.Date > sa.bindparam("date")) + pd.read_sql(expr, engine, params={"date": dt.datetime(2010, 10, 18)}) Sqlite fallback @@ -5296,13 +5324,14 @@ You can create connections like so: .. code-block:: python import sqlite3 - con = sqlite3.connect(':memory:') + + con = sqlite3.connect(":memory:") And then issue the following queries: .. code-block:: python - data.to_sql('data', con) + data.to_sql("data", con) pd.read_sql_query("SELECT * FROM data", con) @@ -5339,8 +5368,8 @@ into a .dta file. The format version of this file is always 115 (Stata 12). .. ipython:: python - df = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) - df.to_stata('stata.dta') + df = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) + df.to_stata("stata.dta") *Stata* data files have limited data type support; only strings with 244 or fewer characters, ``int8``, ``int16``, ``int32``, ``float32`` @@ -5390,7 +5419,7 @@ be used to read the file incrementally. .. 
ipython:: python - pd.read_stata('stata.dta') + pd.read_stata("stata.dta") Specifying a ``chunksize`` yields a :class:`~pandas.io.stata.StataReader` instance that can be used to @@ -5399,7 +5428,7 @@ object can be used as an iterator. .. ipython:: python - reader = pd.read_stata('stata.dta', chunksize=3) + reader = pd.read_stata("stata.dta", chunksize=3) for df in reader: print(df.shape) @@ -5409,7 +5438,7 @@ For more fine-grained control, use ``iterator=True`` and specify .. ipython:: python - reader = pd.read_stata('stata.dta', iterator=True) + reader = pd.read_stata("stata.dta", iterator=True) chunk1 = reader.read(5) chunk2 = reader.read(5) @@ -5441,7 +5470,7 @@ values will have ``object`` data type. .. ipython:: python :suppress: - os.remove('stata.dta') + os.remove("stata.dta") .. _io.stata-categorical: @@ -5513,7 +5542,7 @@ Read a SAS7BDAT file: .. code-block:: python - df = pd.read_sas('sas_data.sas7bdat') + df = pd.read_sas("sas_data.sas7bdat") Obtain an iterator and read an XPORT file 100,000 lines at a time: @@ -5522,7 +5551,8 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time: def do_something(chunk): pass - rdr = pd.read_sas('sas_xport.xpt', chunk=100000) + + rdr = pd.read_sas("sas_xport.xpt", chunk=100000) for chunk in rdr: do_something(chunk) @@ -5556,15 +5586,14 @@ Read an SPSS file: .. code-block:: python - df = pd.read_spss('spss_data.sav') + df = pd.read_spss("spss_data.sav") Extract a subset of columns contained in ``usecols`` from an SPSS file and avoid converting categorical columns into ``pd.Categorical``: .. code-block:: python - df = pd.read_spss('spss_data.sav', usecols=['foo', 'bar'], - convert_categoricals=False) + df = pd.read_spss("spss_data.sav", usecols=["foo", "bar"], convert_categoricals=False) More information about the SAV and ZSAV file formats is available here_. 
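The two keywords shown above can also be combined with an explicit conversion
once the data has been loaded. The following is only a sketch: the file name and
column names are hypothetical, and the ``astype`` step is not part of the SPSS
reader itself.

.. code-block:: python

   # Hypothetical file and column names: read a subset of columns without
   # automatic categorical conversion, then convert one column explicitly.
   df = pd.read_spss(
       "spss_data.sav", usecols=["foo", "bar"], convert_categoricals=False
   )
   df["foo"] = df["foo"].astype("category")

This keeps the initial load simple while still letting selected columns benefit
from the ``category`` dtype.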
@@ -5622,78 +5651,99 @@ Given the next test set: import os sz = 1000000 - df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) + df = pd.DataFrame({"A": np.random.randn(sz), "B": [1] * sz}) sz = 1000000 np.random.seed(42) - df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) + df = pd.DataFrame({"A": np.random.randn(sz), "B": [1] * sz}) + def test_sql_write(df): - if os.path.exists('test.sql'): - os.remove('test.sql') - sql_db = sqlite3.connect('test.sql') - df.to_sql(name='test_table', con=sql_db) + if os.path.exists("test.sql"): + os.remove("test.sql") + sql_db = sqlite3.connect("test.sql") + df.to_sql(name="test_table", con=sql_db) sql_db.close() + def test_sql_read(): - sql_db = sqlite3.connect('test.sql') + sql_db = sqlite3.connect("test.sql") pd.read_sql_query("select * from test_table", sql_db) sql_db.close() + def test_hdf_fixed_write(df): - df.to_hdf('test_fixed.hdf', 'test', mode='w') + df.to_hdf("test_fixed.hdf", "test", mode="w") + def test_hdf_fixed_read(): - pd.read_hdf('test_fixed.hdf', 'test') + pd.read_hdf("test_fixed.hdf", "test") + def test_hdf_fixed_write_compress(df): - df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc') + df.to_hdf("test_fixed_compress.hdf", "test", mode="w", complib="blosc") + def test_hdf_fixed_read_compress(): - pd.read_hdf('test_fixed_compress.hdf', 'test') + pd.read_hdf("test_fixed_compress.hdf", "test") + def test_hdf_table_write(df): - df.to_hdf('test_table.hdf', 'test', mode='w', format='table') + df.to_hdf("test_table.hdf", "test", mode="w", format="table") + def test_hdf_table_read(): - pd.read_hdf('test_table.hdf', 'test') + pd.read_hdf("test_table.hdf", "test") + def test_hdf_table_write_compress(df): - df.to_hdf('test_table_compress.hdf', 'test', mode='w', - complib='blosc', format='table') + df.to_hdf( + "test_table_compress.hdf", "test", mode="w", complib="blosc", format="table" + ) + def test_hdf_table_read_compress(): - pd.read_hdf('test_table_compress.hdf', 'test') + pd.read_hdf("test_table_compress.hdf", "test") + def test_csv_write(df): - df.to_csv('test.csv', mode='w') + df.to_csv("test.csv", mode="w") + def test_csv_read(): - pd.read_csv('test.csv', index_col=0) + pd.read_csv("test.csv", index_col=0) + def test_feather_write(df): - df.to_feather('test.feather') + df.to_feather("test.feather") + def test_feather_read(): - pd.read_feather('test.feather') + pd.read_feather("test.feather") + def test_pickle_write(df): - df.to_pickle('test.pkl') + df.to_pickle("test.pkl") + def test_pickle_read(): - pd.read_pickle('test.pkl') + pd.read_pickle("test.pkl") + def test_pickle_write_compress(df): - df.to_pickle('test.pkl.compress', compression='xz') + df.to_pickle("test.pkl.compress", compression="xz") + def test_pickle_read_compress(): - pd.read_pickle('test.pkl.compress', compression='xz') + pd.read_pickle("test.pkl.compress", compression="xz") + def test_parquet_write(df): - df.to_parquet('test.parquet') + df.to_parquet("test.parquet") + def test_parquet_read(): - pd.read_parquet('test.parquet') + pd.read_parquet("test.parquet") When writing, the top-three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``. diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index dd6ac37d88f08..2ada09117273d 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -46,20 +46,20 @@ infer a list of strings to .. 
ipython:: python - pd.Series(['a', 'b', 'c']) + pd.Series(["a", "b", "c"]) To explicitly request ``string`` dtype, specify the ``dtype`` .. ipython:: python - pd.Series(['a', 'b', 'c'], dtype="string") - pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype()) + pd.Series(["a", "b", "c"], dtype="string") + pd.Series(["a", "b", "c"], dtype=pd.StringDtype()) Or ``astype`` after the ``Series`` or ``DataFrame`` is created .. ipython:: python - s = pd.Series(['a', 'b', 'c']) + s = pd.Series(["a", "b", "c"]) s s.astype("string") @@ -71,7 +71,7 @@ it will be converted to ``string`` dtype: .. ipython:: python - s = pd.Series(['a', 2, np.nan], dtype="string") + s = pd.Series(["a", 2, np.nan], dtype="string") s type(s[1]) @@ -147,15 +147,16 @@ the equivalent (scalar) built-in string methods: .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str.lower() s.str.upper() s.str.len() .. ipython:: python - idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank']) + idx = pd.Index([" jack", "jill ", " jesse ", "frank"]) idx.str.strip() idx.str.lstrip() idx.str.rstrip() @@ -166,8 +167,9 @@ leading or trailing whitespace: .. ipython:: python - df = pd.DataFrame(np.random.randn(3, 2), - columns=[' Column A ', ' Column B '], index=range(3)) + df = pd.DataFrame( + np.random.randn(3, 2), columns=[" Column A ", " Column B "], index=range(3) + ) df Since ``df.columns`` is an Index object, we can use the ``.str`` accessor @@ -183,7 +185,7 @@ and replacing any remaining whitespaces with underscores: .. ipython:: python - df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') + df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_") df .. note:: @@ -221,21 +223,21 @@ Methods like ``split`` return a Series of lists: .. ipython:: python - s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string") - s2.str.split('_') + s2 = pd.Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype="string") + s2.str.split("_") Elements in the split lists can be accessed using ``get`` or ``[]`` notation: .. ipython:: python - s2.str.split('_').str.get(1) - s2.str.split('_').str[1] + s2.str.split("_").str.get(1) + s2.str.split("_").str[1] It is easy to expand this to return a DataFrame using ``expand``. .. ipython:: python - s2.str.split('_', expand=True) + s2.str.split("_", expand=True) When original ``Series`` has :class:`StringDtype`, the output columns will all be :class:`StringDtype` as well. @@ -244,25 +246,25 @@ It is also possible to limit the number of splits: .. ipython:: python - s2.str.split('_', expand=True, n=1) + s2.str.split("_", expand=True, n=1) ``rsplit`` is similar to ``split`` except it works in the reverse direction, i.e., from the end of the string to the beginning of the string: .. ipython:: python - s2.str.rsplit('_', expand=True, n=1) + s2.str.rsplit("_", expand=True, n=1) ``replace`` by default replaces `regular expressions `__: .. ipython:: python - s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', - '', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s3 = pd.Series( + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s3 - s3.str.replace('^.a|dog', 'XX-XX ', case=False) + s3.str.replace("^.a|dog", "XX-XX ", case=False) Some caution must be taken to keep regular expressions in mind! 
For example, the following code will cause trouble because of the regular expression meaning of @@ -271,16 +273,16 @@ following code will cause trouble because of the regular expression meaning of .. ipython:: python # Consider the following badly formatted financial data - dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string") + dollars = pd.Series(["12", "-$10", "$10,000"], dtype="string") # This does what you'd naively expect: - dollars.str.replace('$', '') + dollars.str.replace("$", "") # But this doesn't: - dollars.str.replace('-$', '-') + dollars.str.replace("-$", "-") # We need to escape the special character (for >1 len patterns) - dollars.str.replace(r'-\$', '-') + dollars.str.replace(r"-\$", "-") If you do want literal replacement of a string (equivalent to :meth:`str.replace`), you can set the optional ``regex`` parameter to @@ -290,8 +292,8 @@ and ``repl`` must be strings: .. ipython:: python # These lines are equivalent - dollars.str.replace(r'-\$', '-') - dollars.str.replace('-$', '-', regex=False) + dollars.str.replace(r"-\$", "-") + dollars.str.replace("-$", "-", regex=False) The ``replace`` method can also take a callable as replacement. It is called on every ``pat`` using :func:`re.sub`. The callable should expect one @@ -300,22 +302,24 @@ positional argument (a regex object) and return a string. .. ipython:: python # Reverse every lowercase alphabetic word - pat = r'[a-z]+' + pat = r"[a-z]+" + def repl(m): return m.group(0)[::-1] - pd.Series(['foo 123', 'bar baz', np.nan], - dtype="string").str.replace(pat, repl) + + pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace(pat, repl) # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" + def repl(m): - return m.group('two').swapcase() + return m.group("two").swapcase() - pd.Series(['Foo Bar Baz', np.nan], - dtype="string").str.replace(pat, repl) + + pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(pat, repl) The ``replace`` method also accepts a compiled regular expression object from :func:`re.compile` as a pattern. All flags should be included in the @@ -324,8 +328,9 @@ compiled regular expression object. .. ipython:: python import re - regex_pat = re.compile(r'^.a|dog', flags=re.IGNORECASE) - s3.str.replace(regex_pat, 'XX-XX ') + + regex_pat = re.compile(r"^.a|dog", flags=re.IGNORECASE) + s3.str.replace(regex_pat, "XX-XX ") Including a ``flags`` argument when calling ``replace`` with a compiled regular expression object will raise a ``ValueError``. @@ -352,8 +357,8 @@ The content of a ``Series`` (or ``Index``) can be concatenated: .. ipython:: python - s = pd.Series(['a', 'b', 'c', 'd'], dtype="string") - s.str.cat(sep=',') + s = pd.Series(["a", "b", "c", "d"], dtype="string") + s.str.cat(sep=",") If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``: @@ -365,9 +370,9 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re .. ipython:: python - t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string") - t.str.cat(sep=',') - t.str.cat(sep=',', na_rep='-') + t = pd.Series(["a", "b", np.nan, "d"], dtype="string") + t.str.cat(sep=",") + t.str.cat(sep=",", na_rep="-") Concatenating a Series and something list-like into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -376,14 +381,14 @@ The first argument to :meth:`~Series.str.cat` can be a list-like object, provide .. 
ipython:: python - s.str.cat(['A', 'B', 'C', 'D']) + s.str.cat(["A", "B", "C", "D"]) Missing values on either side will result in missing values in the result as well, *unless* ``na_rep`` is specified: .. ipython:: python s.str.cat(t) - s.str.cat(t, na_rep='-') + s.str.cat(t, na_rep="-") Concatenating a Series and something array-like into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -395,7 +400,7 @@ The parameter ``others`` can also be two-dimensional. In this case, the number o d = pd.concat([t, s], axis=1) s d - s.str.cat(d, na_rep='-') + s.str.cat(d, na_rep="-") Concatenating a Series and an indexed object into a Series, with alignment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -406,12 +411,11 @@ the ``join``-keyword. .. ipython:: python :okwarning: - u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2], - dtype="string") + u = pd.Series(["b", "d", "a", "c"], index=[1, 3, 0, 2], dtype="string") s u s.str.cat(u) - s.str.cat(u, join='left') + s.str.cat(u, join="left") .. warning:: @@ -423,12 +427,11 @@ In particular, alignment also means that the different lengths do not need to co .. ipython:: python - v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4], - dtype="string") + v = pd.Series(["z", "a", "b", "d", "e"], index=[-1, 0, 1, 3, 4], dtype="string") s v - s.str.cat(v, join='left', na_rep='-') - s.str.cat(v, join='outer', na_rep='-') + s.str.cat(v, join="left", na_rep="-") + s.str.cat(v, join="outer", na_rep="-") The same alignment can be used when ``others`` is a ``DataFrame``: @@ -437,7 +440,7 @@ The same alignment can be used when ``others`` is a ``DataFrame``: f = d.loc[[3, 2, 1, 0], :] s f - s.str.cat(f, join='left', na_rep='-') + s.str.cat(f, join="left", na_rep="-") Concatenating a Series and many objects into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -449,7 +452,7 @@ can be combined in a list-like container (including iterators, ``dict``-views, e s u - s.str.cat([u, u.to_numpy()], join='left') + s.str.cat([u, u.to_numpy()], join="left") All elements without an index (e.g. ``np.ndarray``) within the passed list-like must match in length to the calling ``Series`` (or ``Index``), but ``Series`` and ``Index`` may have arbitrary length (as long as alignment is not disabled with ``join=None``): @@ -457,7 +460,7 @@ but ``Series`` and ``Index`` may have arbitrary length (as long as alignment is .. ipython:: python v - s.str.cat([v, u, u.to_numpy()], join='outer', na_rep='-') + s.str.cat([v, u, u.to_numpy()], join="outer", na_rep="-") If using ``join='right'`` on a list-like of ``others`` that contains different indexes, the union of these indexes will be used as the basis for the final concatenation: @@ -466,7 +469,7 @@ the union of these indexes will be used as the basis for the final concatenation u.loc[[3]] v.loc[[-1, 0]] - s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join='right', na_rep='-') + s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join="right", na_rep="-") Indexing with ``.str`` ---------------------- @@ -479,9 +482,9 @@ of the string, the result will be a ``NaN``. .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, - 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str[0] s.str[1] @@ -512,8 +515,7 @@ DataFrame with one column per group. .. 
ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'([ab])(\d)', expand=False) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"([ab])(\d)", expand=False) Elements that do not match return a row filled with ``NaN``. Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -526,16 +528,15 @@ Named groups like .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'(?P[ab])(?P\d)', - expand=False) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract( + r"(?P[ab])(?P\d)", expand=False + ) and optional groups like .. ipython:: python - pd.Series(['a1', 'b2', '3'], - dtype="string").str.extract(r'([ab])?(\d)', expand=False) + pd.Series(["a1", "b2", "3"], dtype="string").str.extract(r"([ab])?(\d)", expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -546,23 +547,20 @@ with one column if ``expand=True``. .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'[ab](\d)', expand=True) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"[ab](\d)", expand=True) It returns a Series if ``expand=False``. .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'[ab](\d)', expand=False) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"[ab](\d)", expand=False) Calling on an ``Index`` with a regex with exactly one capture group returns a ``DataFrame`` with one column if ``expand=True``. .. ipython:: python - s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], - dtype="string") + s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], dtype="string") s s.index.str.extract("(?P[a-zA-Z])", expand=True) @@ -607,10 +605,9 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python - s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], - dtype="string") + s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], dtype="string") s - two_groups = '(?P[a-z])(?P[0-9])' + two_groups = "(?P[a-z])(?P[0-9])" s.str.extract(two_groups, expand=True) the ``extractall`` method returns every match. The result of @@ -626,7 +623,7 @@ When each subject string in the Series has exactly one match, .. ipython:: python - s = pd.Series(['a3', 'b3', 'c2'], dtype="string") + s = pd.Series(["a3", "b3", "c2"], dtype="string") s then ``extractall(pat).xs(0, level='match')`` gives the same result as @@ -657,23 +654,20 @@ You can check whether elements contain a pattern: .. ipython:: python - pattern = r'[0-9][a-z]' - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.contains(pattern) + pattern = r"[0-9][a-z]" + pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.contains(pattern) Or whether elements match a pattern: .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.match(pattern) + pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.match(pattern) .. versionadded:: 1.1.0 .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.fullmatch(pattern) + pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.fullmatch(pattern) .. note:: @@ -695,9 +689,10 @@ True or False: .. 
ipython:: python - s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") - s4.str.contains('A', na=False) + s4 = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) + s4.str.contains("A", na=False) .. _text.indicator: @@ -709,15 +704,15 @@ For example if they are separated by a ``'|'``: .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string") - s.str.get_dummies(sep='|') + s = pd.Series(["a", "a|b", np.nan, "a|c"], dtype="string") + s.str.get_dummies(sep="|") String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``. .. ipython:: python - idx = pd.Index(['a', 'a|b', np.nan, 'a|c']) - idx.str.get_dummies(sep='|') + idx = pd.Index(["a", "a|b", np.nan, "a|c"]) + idx.str.get_dummies(sep="|") See also :func:`~pandas.get_dummies`.
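As an illustrative sketch (reusing the ``Series`` from the example above; the
``"raw"`` label is arbitrary), the indicator columns returned by ``get_dummies``
can be concatenated back alongside the original values:

.. code-block:: python

   import numpy as np
   import pandas as pd

   s = pd.Series(["a", "a|b", np.nan, "a|c"], dtype="string")
   dummies = s.str.get_dummies(sep="|")

   # One column per distinct value, aligned with the original strings.
   pd.concat([s.rename("raw"), dummies], axis=1)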