.pre-commit-config.yaml (new file: 6 additions, 0 deletions)
@@ -0,0 +1,6 @@
repos:
- repo: https://github.com/psf/black
  rev: stable
  hooks:
  - id: black
    language_version: python3.6

Review comment (Member): Hm, I was thinking about using .git/hooks/pre-commit right away rather than using a dependency for now.
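For reference, the reviewer's alternative of a hand-rolled .git/hooks/pre-commit hook (no pre-commit dependency) could look roughly like the sketch below. This is illustrative only, not part of the PR; it assumes the hook file is made executable and that black is already installed on the PATH.

#!/usr/bin/env python3
# Hypothetical .git/hooks/pre-commit sketch (not part of this PR): run Black in
# --check mode over the staged Python files and block the commit if any of them
# would be reformatted.
import subprocess
import sys

staged = subprocess.run(
    ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"],
    stdout=subprocess.PIPE, universal_newlines=True, check=True,
).stdout.splitlines()
py_files = [f for f in staged if f.endswith(".py")]

if py_files:
    # Black exits non-zero when a file would be changed, which aborts the commit.
    sys.exit(subprocess.run(["black", "--check"] + py_files).returncode)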
.travis.yml (3 additions, 1 deletion)
@@ -22,6 +22,7 @@ matrix:
- PATH=$(echo "$PATH" | sed -e 's/:\/usr\/local\/lib\/jvm\/openjdk11\/bin//')
- JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
- SPARK_VERSION=2.4.5
- BLACK_VERSION=19.10b0
- PANDAS_VERSION=0.24.2
- PYARROW_VERSION=0.13.0
- KOALAS_USAGE_LOGGER='databricks.koalas.usage_logging.usage_logger'
@@ -36,6 +37,7 @@ matrix:
- PATH=$(echo "$PATH" | sed -e 's/:\/usr\/local\/lib\/jvm\/openjdk11\/bin//')
- JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
- SPARK_VERSION=2.4.5
- BLACK_VERSION=19.10b0
- PANDAS_VERSION=0.25.3
- PYARROW_VERSION=0.14.1

@@ -76,7 +78,7 @@ install:
conda config --env --add pinned_packages python=$TRAVIS_PYTHON_VERSION && \
conda config --env --add pinned_packages pandas==$PANDAS_VERSION && \
conda config --env --add pinned_packages pyarrow==$PYARROW_VERSION && \
conda install -c conda-forge --yes pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION && \
conda install -c conda-forge --yes black==$BLACK_VERSION pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION && \
conda install -c conda-forge --yes --freeze-installed --file requirements-dev.txt && \
conda list;
fi
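The conftest below already asserts that the installed pandas and pyarrow match PANDAS_VERSION and PYARROW_VERSION. A matching guard for the new BLACK_VERSION pin is not part of this diff, but as a sketch it could mirror the same pattern:

import os

import black  # the formatter pinned via BLACK_VERSION above

# Hypothetical sanity check, mirroring the existing PANDAS_VERSION/PYARROW_VERSION
# assertions in databricks/conftest.py: fail fast if CI installed a different Black.
if os.getenv("BLACK_VERSION") is not None:
    assert black.__version__ == os.getenv("BLACK_VERSION")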
databricks/__init__.py (1 addition, 1 deletion)
@@ -15,4 +15,4 @@
#

# https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages
__path__ = __import__('pkgutil').extend_path(__path__, __name__) # type: ignore
__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore
databricks/conftest.py (9 additions, 11 deletions)
@@ -32,9 +32,7 @@
from databricks.koalas import utils


shared_conf = {
"spark.sql.shuffle.partitions": "4"
}
shared_conf = {"spark.sql.shuffle.partitions": "4"}
# Initialize Spark session that should be used in doctests or unittests.
# Delta requires Spark 2.4.2+. See
# https://github.com/delta-io/delta#compatibility-with-apache-spark-versions.
@@ -48,7 +46,7 @@
session = utils.default_session(shared_conf)


@pytest.fixture(scope='session', autouse=True)
@pytest.fixture(scope="session", autouse=True)
def session_termination():
yield
# Share one session across all the tests. Repeating starting and stopping sessions and contexts
@@ -58,46 +56,46 @@ def session_termination():

@pytest.fixture(autouse=True)
def add_ks(doctest_namespace):
doctest_namespace['ks'] = koalas
doctest_namespace["ks"] = koalas


@pytest.fixture(autouse=True)
def add_pd(doctest_namespace):
if os.getenv("PANDAS_VERSION", None) is not None:
assert pd.__version__ == os.getenv("PANDAS_VERSION")
doctest_namespace['pd'] = pd
doctest_namespace["pd"] = pd


@pytest.fixture(autouse=True)
def add_pa(doctest_namespace):
if os.getenv("PYARROW_VERSION", None) is not None:
assert pa.__version__ == os.getenv("PYARROW_VERSION")
doctest_namespace['pa'] = pa
doctest_namespace["pa"] = pa


@pytest.fixture(autouse=True)
def add_np(doctest_namespace):
doctest_namespace['np'] = numpy
doctest_namespace["np"] = numpy


@pytest.fixture(autouse=True)
def add_path(doctest_namespace):
path = tempfile.mkdtemp()
atexit.register(lambda: shutil.rmtree(path, ignore_errors=True))
doctest_namespace['path'] = path
doctest_namespace["path"] = path


@pytest.fixture(autouse=True)
def add_db(doctest_namespace):
db_name = "db%s" % str(uuid.uuid4()).replace("-", "")
session.sql("CREATE DATABASE %s" % db_name)
atexit.register(lambda: session.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name))
doctest_namespace['db'] = db_name
doctest_namespace["db"] = db_name


@pytest.fixture(autouse=os.getenv("KOALAS_USAGE_LOGGER", None) is not None)
def add_caplog(caplog):
with caplog.at_level(logging.INFO, logger='databricks.koalas.usage_logger'):
with caplog.at_level(logging.INFO, logger="databricks.koalas.usage_logger"):
yield
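
These autouse fixtures only pre-populate pytest's doctest namespace; doctests elsewhere in the code base can then use the injected names (ks, pd, np, pa, path, db) without importing or creating them. A minimal hypothetical example of a doctest that relies on them:

def from_pandas_roundtrip():
    """Hypothetical doctest using the names injected by conftest.py.

    >>> pdf = pd.DataFrame({"a": [1, 2, 3]})   # `pd` comes from add_pd
    >>> kdf = ks.from_pandas(pdf)              # `ks` comes from add_ks
    >>> kdf.to_parquet("%s/example" % path)    # `path` comes from add_path
    """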


databricks/koalas/__init__.py (53 additions, 18 deletions)
@@ -21,27 +21,33 @@

def assert_pyspark_version():
import logging

pyspark_ver = None
try:
import pyspark
except ImportError:
raise ImportError('Unable to import pyspark - consider doing a pip install with [spark] '
'extra to install pyspark with pip')
raise ImportError(
"Unable to import pyspark - consider doing a pip install with [spark] "
"extra to install pyspark with pip"
)
else:
pyspark_ver = getattr(pyspark, '__version__')
if pyspark_ver is None or pyspark_ver < '2.4':
pyspark_ver = getattr(pyspark, "__version__")
if pyspark_ver is None or pyspark_ver < "2.4":
logging.warning(
'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'
.format(pyspark_ver if pyspark_ver is not None else '<unknown version>'))
'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format(
pyspark_ver if pyspark_ver is not None else "<unknown version>"
)
)


assert_pyspark_version()

import pyspark
import pyarrow

if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and \
LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and LooseVersion(
pyspark.__version__
) < LooseVersion("3.0"):
# This is required to support PyArrow 0.15 in PySpark versions lower than 3.0.
# See SPARK-29367.
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
@@ -53,10 +59,31 @@ def assert_pyspark_version():
from databricks.koalas.config import get_option, set_option, reset_option, options
from databricks.koalas.groupby import NamedAgg

__all__ = ['read_csv', 'read_parquet', 'to_datetime', 'from_pandas',
'get_dummies', 'DataFrame', 'Series', 'Index', 'MultiIndex', 'pandas_wraps',
'sql', 'range', 'concat', 'melt', 'get_option', 'set_option', 'reset_option',
'read_sql_table', 'read_sql_query', 'read_sql', 'options', 'option_context', 'NamedAgg']
__all__ = [
"read_csv",
"read_parquet",
"to_datetime",
"from_pandas",
"get_dummies",
"DataFrame",
"Series",
"Index",
"MultiIndex",
"pandas_wraps",
"sql",
"range",
"concat",
"melt",
"get_option",
"set_option",
"reset_option",
"read_sql_table",
"read_sql_query",
"read_sql",
"options",
"option_context",
"NamedAgg",
]


def _auto_patch():
@@ -68,21 +95,29 @@ def _auto_patch():
if logger_module is not None:
try:
from databricks.koalas import usage_logging

usage_logging.attach(logger_module)
except Exception as e:
from pyspark.util import _exception_message
logger = logging.getLogger('databricks.koalas.usage_logger')
logger.warning('Tried to attach usage logger `{}`, but an exception was raised: {}'
.format(logger_module, _exception_message(e)))

logger = logging.getLogger("databricks.koalas.usage_logger")
logger.warning(
"Tried to attach usage logger `{}`, but an exception was raised: {}".format(
logger_module, _exception_message(e)
)
)

# Autopatching is on by default.
x = os.getenv("SPARK_KOALAS_AUTOPATCH", "true")
if x.lower() in ("true", "1", "enabled"):
logger = logging.getLogger('spark')
logger.info("Patching spark automatically. You can disable it by setting "
"SPARK_KOALAS_AUTOPATCH=false in your environment")
logger = logging.getLogger("spark")
logger.info(
"Patching spark automatically. You can disable it by setting "
"SPARK_KOALAS_AUTOPATCH=false in your environment"
)

from pyspark.sql import dataframe as df

df.DataFrame.to_koalas = DataFrame.to_koalas

