Top-level Koalas functions - merge (#969)

itholic · HyukjinKwon · commit be9890db6979 · 2019-10-30T15:45:50.000+09:00
Resolves #961

```python
>>> df1 = ks.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]},
...                    columns=['lkey', 'value'])
>>> df2 = ks.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]},
...                    columns=['rkey', 'value'])
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

>>> merged = ks.merge(df1, df2, left_on='lkey', right_on='rkey')
>>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y'])
  lkey  value_x rkey  value_y
0  bar        2  bar        6
5  baz        3  baz        7
1  foo        1  foo        5
2  foo        1  foo        8
3  foo        5  foo        5
4  foo        5  foo        8

>>> left_kdf = ks.DataFrame({'A': [1, 2]})
>>> right_kdf = ks.DataFrame({'B': ['x', 'y']}, index=[1, 2])

>>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True)
   A  B
1  2  x

>>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='left')
   A     B
0  1  None
1  2     x

>>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='right')
     A  B
1  2.0  x
2  NaN  y

>>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='outer')
     A     B
0  1.0  None
1  2.0     x
2  NaN     y
```
diff --git a/databricks/koalas/namespace.py b/databricks/koalas/namespace.py
@@ -18,6 +18,7 @@
 Wrappers around spark that correspond to common pandas functions.
 """
 from typing import Optional, Union, List
+from typing import Optional, Union, List, Tuple
 from collections import OrderedDict
 from collections.abc import Iterable
 from functools import reduce
@@ -43,7 +44,7 @@
 __all__ = ["from_pandas", "range", "read_csv", "read_delta", "read_table", "read_spark_io",
            "read_parquet", "read_clipboard", "read_excel", "read_html", "to_datetime",
            "get_dummies", "concat", "melt", "isna", "isnull", "notna", "notnull",
-           "read_sql_table", "read_sql_query", "read_sql", "read_json"]
+           "read_sql_table", "read_sql_query", "read_sql", "read_json", "merge"]
 
 
 def from_pandas(pobj: Union['pd.DataFrame', 'pd.Series']) -> Union['Series', 'DataFrame']:
@@ -1737,6 +1738,127 @@ def notna(obj):
 notnull = notna
 
 
+def merge(obj, right: 'DataFrame', how: str = 'inner',
+          on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None,
+          left_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None,
+          right_on: Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]] = None,
+          left_index: bool = False, right_index: bool = False,
+          suffixes: Tuple[str, str] = ('_x', '_y')) -> 'DataFrame':
+    """
+    Merge DataFrame objects with a database-style join.
+
+    The index of the resulting DataFrame will be one of the following:
+        - 0...n if no index is used for merging
+        - Index of the left DataFrame if merged only on the index of the right DataFrame
+        - Index of the right DataFrame if merged only on the index of the left DataFrame
+        - All involved indices if merged using the indices of both DataFrames
+            e.g. if `left` with indices (a, x) and `right` with indices (b, x), the result will
+            be an index (x, a, b)
+
+    Parameters
+    ----------
+    obj: Left object of the merge; the join is performed by calling its `merge` method.
+    right: Object to merge with.
+    how: Type of merge to be performed.
+        {'left', 'right', 'outer', 'inner'}, default 'inner'
+
+        left: use only keys from left frame, similar to a SQL left outer join; preserve key
+            order.
+        right: use only keys from right frame, similar to a SQL right outer join; preserve key
+            order.
+        outer: use union of keys from both frames, similar to a SQL full outer join; sort keys
+            lexicographically.
+        inner: use intersection of keys from both frames, similar to a SQL inner join;
+            preserve the order of the left keys.
+    on: Column or index level names to join on. These must be found in both DataFrames. If on
+        is None and not merging on indexes then this defaults to the intersection of the
+        columns in both DataFrames.
+    left_on: Column or index level names to join on in the left DataFrame. Can also
+        be an array or list of arrays of the length of the left DataFrame.
+        These arrays are treated as if they are columns.
+    right_on: Column or index level names to join on in the right DataFrame. Can also
+        be an array or list of arrays of the length of the right DataFrame.
+        These arrays are treated as if they are columns.
+    left_index: Use the index from the left DataFrame as the join key(s). If it is a
+        MultiIndex, the number of keys in the other DataFrame (either the index or a number of
+        columns) must match the number of levels.
+    right_index: Use the index from the right DataFrame as the join key. Same caveats as
+        left_index.
+    suffixes: Suffix to apply to overlapping column names in the left and right side,
+        respectively.
+
+    Returns
+    -------
+    DataFrame
+        A DataFrame of the two merged objects.
+
+    Examples
+    --------
+
+    >>> df1 = ks.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
+    ...                     'value': [1, 2, 3, 5]},
+    ...                    columns=['lkey', 'value'])
+    >>> df2 = ks.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
+    ...                     'value': [5, 6, 7, 8]},
+    ...                    columns=['rkey', 'value'])
+    >>> df1
+      lkey  value
+    0  foo      1
+    1  bar      2
+    2  baz      3
+    3  foo      5
+    >>> df2
+      rkey  value
+    0  foo      5
+    1  bar      6
+    2  baz      7
+    3  foo      8
+
+    Merge df1 and df2 on the lkey and rkey columns. The value columns have
+    the default suffixes, _x and _y, appended.
+
+    >>> merged = ks.merge(df1, df2, left_on='lkey', right_on='rkey')
+    >>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y'])
+      lkey  value_x rkey  value_y
+    0  bar        2  bar        6
+    5  baz        3  baz        7
+    1  foo        1  foo        5
+    2  foo        1  foo        8
+    3  foo        5  foo        5
+    4  foo        5  foo        8
+
+    >>> left_kdf = ks.DataFrame({'A': [1, 2]})
+    >>> right_kdf = ks.DataFrame({'B': ['x', 'y']}, index=[1, 2])
+
+    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True)
+       A  B
+    1  2  x
+
+    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='left')
+       A     B
+    0  1  None
+    1  2     x
+
+    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='right')
+         A  B
+    1  2.0  x
+    2  NaN  y
+
+    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='outer')
+         A     B
+    0  1.0  None
+    1  2.0     x
+    2  NaN     y
+
+    Notes
+    -----
+    Joining string columns currently returns None instead of NaN for missing values (see #263).
+    """
+    return obj.merge(
+        right, how=how, on=on, left_on=left_on, right_on=right_on,
+        left_index=left_index, right_index=right_index, suffixes=suffixes)
+
+
 # @pandas_wraps(return_col=np.datetime64)
 @pandas_wraps
 def _to_datetime1(arg, errors, format, unit, infer_datetime_format,
diff --git a/docs/source/reference/general_functions.rst b/docs/source/reference/general_functions.rst
@@ -21,6 +21,7 @@ Data manipulations and SQL
    :toctree: api/
 
    melt
+   merge
    get_dummies
    concat
    sql