Skip to content

Commit be9890d

Browse files
itholic authored and HyukjinKwon committed
Top-level Koalas functions - merge (#969)
Resolves #961

```python
>>> df1 = ks.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]},
...                    columns=['lkey', 'value'])
>>> df2 = ks.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]},
...                    columns=['rkey', 'value'])
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

>>> merged = ks.merge(df1, df2, left_on='lkey', right_on='rkey')
>>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y'])
  lkey  value_x rkey  value_y
0  bar        2  bar        6
5  baz        3  baz        7
1  foo        1  foo        5
2  foo        1  foo        8
3  foo        5  foo        5
4  foo        5  foo        8

>>> left_kdf = ks.DataFrame({'A': [1, 2]})
>>> right_kdf = ks.DataFrame({'B': ['x', 'y']}, index=[1, 2])

>>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True)
   A  B
1  2  x

>>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='left')
   A     B
0  1  None
1  2     x

>>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='right')
     A  B
1  2.0  x
2  NaN  y

>>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='outer')
     A     B
0  1.0  None
1  2.0     x
2  NaN     y
```
1 parent 8303ab1 commit be9890d

File tree

2 files changed

+124
-1
lines changed

2 files changed

+124
-1
lines changed

databricks/koalas/namespace.py

Lines changed: 123 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
Wrappers around spark that correspond to common pandas functions.
1919
"""
2020
from typing import Optional, Union, List
21+
from typing import Optional, Union, List, Tuple
2122
from collections import OrderedDict
2223
from collections.abc import Iterable
2324
from functools import reduce
@@ -43,7 +44,7 @@
4344
__all__ = ["from_pandas", "range", "read_csv", "read_delta", "read_table", "read_spark_io",
4445
"read_parquet", "read_clipboard", "read_excel", "read_html", "to_datetime",
4546
"get_dummies", "concat", "melt", "isna", "isnull", "notna", "notnull",
46-
"read_sql_table", "read_sql_query", "read_sql", "read_json"]
47+
"read_sql_table", "read_sql_query", "read_sql", "read_json", "merge"]
4748

4849

4950
def from_pandas(pobj: Union['pd.DataFrame', 'pd.Series']) -> Union['Series', 'DataFrame']:
@@ -1737,6 +1738,127 @@ def notna(obj):
17371738
notnull = notna
17381739

17391740

1741+
def merge(
        obj: 'DataFrame',
        right: 'DataFrame',
        how: str = 'inner',
        on: Optional[Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]]] = None,
        left_on: Optional[Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]]] = None,
        right_on: Optional[Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]]] = None,
        left_index: bool = False,
        right_index: bool = False,
        suffixes: Tuple[str, str] = ('_x', '_y')) -> 'DataFrame':
    """
    Merge DataFrame objects with a database-style join.

    This is the top-level counterpart of the ``merge`` method: it simply calls
    ``obj.merge(right, ...)`` and forwards every other argument by keyword.

    The index of the resulting DataFrame will be one of the following:
        - 0...n if no index is used for merging
        - Index of the left DataFrame if merged only on the index of the right DataFrame
        - Index of the right DataFrame if merged only on the index of the left DataFrame
        - All involved indices if merged using the indices of both DataFrames
            e.g. if `left` with indices (a, x) and `right` with indices (b, x), the result will
            be an index (x, a, b)

    Parameters
    ----------
    obj: The left object of the merge; its ``merge`` method performs the join.
    right: Object to merge with.
    how: Type of merge to be performed.
        {'left', 'right', 'outer', 'inner'}, default 'inner'

        left: use only keys from left frame, similar to a SQL left outer join; preserve key
            order.
        right: use only keys from right frame, similar to a SQL right outer join; preserve key
            order.
        outer: use union of keys from both frames, similar to a SQL full outer join; sort keys
            lexicographically.
        inner: use intersection of keys from both frames, similar to a SQL inner join;
            preserve the order of the left keys.
    on: Column or index level names to join on. These must be found in both DataFrames. If on
        is None and not merging on indexes then this defaults to the intersection of the
        columns in both DataFrames.
    left_on: Column or index level names to join on in the left DataFrame. Can also
        be an array or list of arrays of the length of the left DataFrame.
        These arrays are treated as if they are columns.
    right_on: Column or index level names to join on in the right DataFrame. Can also
        be an array or list of arrays of the length of the right DataFrame.
        These arrays are treated as if they are columns.
    left_index: Use the index from the left DataFrame as the join key(s). If it is a
        MultiIndex, the number of keys in the other DataFrame (either the index or a number of
        columns) must match the number of levels.
    right_index: Use the index from the right DataFrame as the join key. Same caveats as
        left_index.
    suffixes: Suffix to apply to overlapping column names in the left and right side,
        respectively.

    Returns
    -------
    DataFrame
        A DataFrame of the two merged objects.

    Examples
    --------

    >>> df1 = ks.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
    ...                     'value': [1, 2, 3, 5]},
    ...                    columns=['lkey', 'value'])
    >>> df2 = ks.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
    ...                     'value': [5, 6, 7, 8]},
    ...                    columns=['rkey', 'value'])
    >>> df1
      lkey  value
    0  foo      1
    1  bar      2
    2  baz      3
    3  foo      5
    >>> df2
      rkey  value
    0  foo      5
    1  bar      6
    2  baz      7
    3  foo      8

    Merge df1 and df2 on the lkey and rkey columns. The value columns have
    the default suffixes, _x and _y, appended.

    >>> merged = ks.merge(df1, df2, left_on='lkey', right_on='rkey')
    >>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y'])
      lkey  value_x rkey  value_y
    0  bar        2  bar        6
    5  baz        3  baz        7
    1  foo        1  foo        5
    2  foo        1  foo        8
    3  foo        5  foo        5
    4  foo        5  foo        8

    >>> left_kdf = ks.DataFrame({'A': [1, 2]})
    >>> right_kdf = ks.DataFrame({'B': ['x', 'y']}, index=[1, 2])

    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True)
       A  B
    1  2  x

    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='left')
       A     B
    0  1  None
    1  2     x

    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='right')
         A  B
    1  2.0  x
    2  NaN  y

    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='outer')
         A     B
    0  1.0  None
    1  2.0     x
    2  NaN     y

    Notes
    -----
    As described in #263, joining string columns currently returns None for missing values
    instead of NaN.
    """
    # Pure delegation: the join logic lives in the method on ``obj``. Everything except
    # ``right`` is passed by keyword so the call does not depend on the method's
    # positional parameter order.
    return obj.merge(
        right, how=how, on=on, left_on=left_on, right_on=right_on,
        left_index=left_index, right_index=right_index, suffixes=suffixes)
1860+
1861+
17401862
# @pandas_wraps(return_col=np.datetime64)
17411863
@pandas_wraps
17421864
def _to_datetime1(arg, errors, format, unit, infer_datetime_format,

docs/source/reference/general_functions.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Data manipulations and SQL
2121
:toctree: api/
2222

2323
melt
24+
merge
2425
get_dummies
2526
concat
2627
sql

0 commit comments

Comments
 (0)