|
18 | 18 | Wrappers around spark that correspond to common pandas functions. |
19 | 19 | """ |
20 | 20 | from typing import Optional, Union, List |
| 21 | +from typing import Optional, Union, List, Tuple |
21 | 22 | from collections import OrderedDict |
22 | 23 | from collections.abc import Iterable |
23 | 24 | from functools import reduce |
|
43 | 44 | __all__ = ["from_pandas", "range", "read_csv", "read_delta", "read_table", "read_spark_io", |
44 | 45 | "read_parquet", "read_clipboard", "read_excel", "read_html", "to_datetime", |
45 | 46 | "get_dummies", "concat", "melt", "isna", "isnull", "notna", "notnull", |
46 | | - "read_sql_table", "read_sql_query", "read_sql", "read_json"] |
| 47 | + "read_sql_table", "read_sql_query", "read_sql", "read_json", "merge"] |
47 | 48 |
|
48 | 49 |
|
49 | 50 | def from_pandas(pobj: Union['pd.DataFrame', 'pd.Series']) -> Union['Series', 'DataFrame']: |
@@ -1737,6 +1738,127 @@ def notna(obj): |
1737 | 1738 | notnull = notna |
1738 | 1739 |
|
1739 | 1740 |
|
def merge(
    obj: 'DataFrame',
    right: 'DataFrame',
    how: str = 'inner',
    on: Optional[Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]]] = None,
    left_on: Optional[Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]]] = None,
    right_on: Optional[Union[str, List[str], Tuple[str, ...], List[Tuple[str, ...]]]] = None,
    left_index: bool = False,
    right_index: bool = False,
    suffixes: Tuple[str, str] = ('_x', '_y'),
) -> 'DataFrame':
    """
    Merge DataFrame objects with a database-style join.

    This is the function form of the ``merge`` method: every argument is forwarded
    unchanged to ``obj.merge``, with `obj` acting as the left side of the join.

    The index of the resulting DataFrame will be one of the following:
        - 0...n if no index is used for merging
        - Index of the left DataFrame if merged only on the index of the right DataFrame
        - Index of the right DataFrame if merged only on the index of the left DataFrame
        - All involved indices if merged using the indices of both DataFrames
            e.g. if `left` with indices (a, x) and `right` with indices (b, x), the result will
            be an index (x, a, b)

    Parameters
    ----------
    obj: Left object of the merge; its ``merge`` method performs the join.
    right: Object to merge with.
    how: Type of merge to be performed.
        {'left', 'right', 'outer', 'inner'}, default 'inner'

        left: use only keys from left frame, similar to a SQL left outer join; preserve key
            order.
        right: use only keys from right frame, similar to a SQL right outer join; preserve key
            order.
        outer: use union of keys from both frames, similar to a SQL full outer join; sort keys
            lexicographically.
        inner: use intersection of keys from both frames, similar to a SQL inner join;
            preserve the order of the left keys.
    on: Column or index level names to join on. These must be found in both DataFrames. If on
        is None and not merging on indexes then this defaults to the intersection of the
        columns in both DataFrames.
    left_on: Column or index level names to join on in the left DataFrame. Can also
        be an array or list of arrays of the length of the left DataFrame.
        These arrays are treated as if they are columns.
    right_on: Column or index level names to join on in the right DataFrame. Can also
        be an array or list of arrays of the length of the right DataFrame.
        These arrays are treated as if they are columns.
    left_index: Use the index from the left DataFrame as the join key(s). If it is a
        MultiIndex, the number of keys in the other DataFrame (either the index or a number of
        columns) must match the number of levels.
    right_index: Use the index from the right DataFrame as the join key. Same caveats as
        left_index.
    suffixes: Suffix to apply to overlapping column names in the left and right side,
        respectively.

    Returns
    -------
    DataFrame
        A DataFrame of the two merged objects.

    Examples
    --------

    >>> df1 = ks.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
    ...                     'value': [1, 2, 3, 5]},
    ...                    columns=['lkey', 'value'])
    >>> df2 = ks.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
    ...                     'value': [5, 6, 7, 8]},
    ...                    columns=['rkey', 'value'])
    >>> df1
      lkey  value
    0  foo      1
    1  bar      2
    2  baz      3
    3  foo      5
    >>> df2
      rkey  value
    0  foo      5
    1  bar      6
    2  baz      7
    3  foo      8

    Merge df1 and df2 on the lkey and rkey columns. The value columns have
    the default suffixes, _x and _y, appended.

    >>> merged = ks.merge(df1, df2, left_on='lkey', right_on='rkey')
    >>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y'])
      lkey  value_x rkey  value_y
    0  bar        2  bar        6
    5  baz        3  baz        7
    1  foo        1  foo        5
    2  foo        1  foo        8
    3  foo        5  foo        5
    4  foo        5  foo        8

    >>> left_kdf = ks.DataFrame({'A': [1, 2]})
    >>> right_kdf = ks.DataFrame({'B': ['x', 'y']}, index=[1, 2])

    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True)
       A  B
    1  2  x

    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='left')
       A     B
    0  1  None
    1  2     x

    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='right')
         A  B
    1  2.0  x
    2  NaN  y

    >>> ks.merge(left_kdf, right_kdf, left_index=True, right_index=True, how='outer')
         A     B
    0  1.0  None
    1  2.0     x
    2  NaN     y

    Notes
    -----
    As described in #263, joining string columns currently returns None for missing values
    instead of NaN.
    """
    # Pure delegation: the left object owns the join logic, so this wrapper adds no
    # validation or defaults of its own beyond those in the signature.
    return obj.merge(
        right, how=how, on=on, left_on=left_on, right_on=right_on,
        left_index=left_index, right_index=right_index, suffixes=suffixes)
| 1860 | + |
| 1861 | + |
1740 | 1862 | # @pandas_wraps(return_col=np.datetime64) |
1741 | 1863 | @pandas_wraps |
1742 | 1864 | def _to_datetime1(arg, errors, format, unit, infer_datetime_format, |
|
0 commit comments