|
17 | 17 | """ |
18 | 18 | A wrapper class for Spark DataFrame to behave similar to pandas DataFrame. |
19 | 19 | """ |
| 20 | +from collections import OrderedDict |
20 | 21 | from distutils.version import LooseVersion |
21 | 22 | import re |
22 | 23 | import warnings |
23 | 24 | import inspect |
24 | 25 | from functools import partial, reduce |
25 | 26 | import sys |
| 27 | +from itertools import zip_longest |
26 | 28 | from typing import Any, Optional, List, Tuple, Union, Generic, TypeVar |
27 | 29 |
|
28 | 30 | import numpy as np |
|
42 | 44 | from pyspark.sql.functions import pandas_udf |
43 | 45 |
|
44 | 46 | from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. |
45 | | -from databricks.koalas.utils import validate_arguments_and_invoke_function |
| 47 | +from databricks.koalas.utils import validate_arguments_and_invoke_function, align_diff_frames |
46 | 48 | from databricks.koalas.generic import _Frame, max_display_count |
47 | 49 | from databricks.koalas.internal import _InternalFrame, IndexMap |
48 | 50 | from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame |
@@ -386,19 +388,33 @@ def calculate_columns_axis(*cols): |
386 | 388 |
|
387 | 389 | # Arithmetic Operators |
388 | 390 | def _map_series_op(self, op, other): |
389 | | - if isinstance(other, DataFrame) or is_sequence(other): |
| 391 | + if not isinstance(other, DataFrame) and is_sequence(other): |
390 | 392 | raise ValueError( |
391 | | - "%s with another DataFrame or a sequence is currently not supported; " |
| 393 | + "%s with a sequence is currently not supported; " |
392 | 394 | "however, got %s." % (op, type(other))) |
393 | 395 |
|
394 | 396 | applied = [] |
395 | | - for column in self._internal.data_columns: |
396 | | - applied.append(getattr(self[column], op)(other)) |
| 397 | + if isinstance(other, DataFrame) and self is not other: |
| 398 | + # Different DataFrames |
| 399 | + def apply_op(kdf, this_columns, that_columns): |
| 400 | + for this_column, that_column in zip(this_columns, that_columns): |
| 401 | + yield getattr(kdf[this_column], op)(kdf[that_column]) |
| 402 | + |
| 403 | + return align_diff_frames( |
| 404 | + apply_op, self, other, fillna=True, how="full", include_all_that_columns=False) |
| 405 | + elif isinstance(other, DataFrame) and self is not other: |
| 406 | + # Same DataFrames |
| 407 | + for column in self._internal.data_columns: |
| 408 | + applied.append(getattr(self[column], op)(other[column])) |
| 409 | + else: |
| 410 | + # DataFrame and Series |
| 411 | + for column in self._internal.data_columns: |
| 412 | + applied.append(getattr(self[column], op)(other)) |
397 | 413 |
|
398 | | - sdf = self._sdf.select( |
399 | | - self._internal.index_scols + [c._scol for c in applied]) |
400 | | - internal = self._internal.copy(sdf=sdf, data_columns=[c.name for c in applied]) |
401 | | - return DataFrame(internal) |
| 414 | + sdf = self._sdf.select( |
| 415 | + self._internal.index_scols + [c._scol for c in applied]) |
| 416 | + internal = self._internal.copy(sdf=sdf, data_columns=[c.name for c in applied]) |
| 417 | + return DataFrame(internal) |
402 | 418 |
|
403 | 419 | def __add__(self, other): |
404 | 420 | return self._map_series_op("add", other) |
@@ -6337,17 +6353,36 @@ def __getitem__(self, key): |
6337 | 6353 |
|
6338 | 6354 | def __setitem__(self, key, value): |
6339 | 6355 | from databricks.koalas.series import Series |
6340 | | - # For now, we don't support realignment against different dataframes. |
6341 | | - # This is too expensive in Spark. |
6342 | | - # Are we assigning against a column? |
6343 | | - if isinstance(value, Series): |
6344 | | - assert value._kdf is self, \ |
6345 | | - "Cannot combine column argument because it comes from a different dataframe" |
6346 | | - if isinstance(key, (tuple, list)): |
6347 | | - assert isinstance(value.schema, StructType) |
6348 | | - field_names = value.schema.fieldNames() |
| 6356 | + |
| 6357 | + if ((isinstance(value, Series) and value._kdf is not self) or |
| 6358 | + (isinstance(value, DataFrame) and value is not self)): |
| 6359 | + # Different (anchor) DataFrames |
| 6360 | + if isinstance(value, Series): |
| 6361 | + value = value.to_frame() |
| 6362 | + |
| 6363 | + if not isinstance(key, (tuple, list)): |
| 6364 | + key = [key] |
| 6365 | + |
| 6366 | + def assign_columns(kdf, this_columns, that_columns): |
| 6367 | + assert len(key) == len(that_columns) |
 | 6368 | +                # `zip_longest` is used intentionally here: `this_columns` can be
 | 6369 | +                # shorter than `that_columns`, and the missing entries come through as None.
| 6370 | + for k, this_column, that_column in zip_longest(key, this_columns, that_columns): |
| 6371 | + yield kdf[that_column].rename(k) |
| 6372 | + if this_column is not None: |
 | 6373 | +                        # If both sides have the same column, the first one yielded takes priority.
| 6374 | + yield kdf[this_column] |
| 6375 | + |
| 6376 | + kdf = align_diff_frames( |
| 6377 | + assign_columns, self, value, fillna=False, |
| 6378 | + how="left", include_all_that_columns=True) |
| 6379 | + elif isinstance(key, (tuple, list)): |
| 6380 | + assert isinstance(value, DataFrame) |
| 6381 | + # Same DataFrames. |
| 6382 | + field_names = value.columns |
6349 | 6383 | kdf = self.assign(**{k: value[c] for k, c in zip(key, field_names)}) |
6350 | 6384 | else: |
| 6385 | + # Same anchor DataFrames. |
6351 | 6386 | kdf = self.assign(**{key: value}) |
6352 | 6387 |
|
6353 | 6388 | self._internal = kdf._internal |
|
0 commit comments