diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 879bd24a622..e67a4a5b107 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,6 +1,6 @@
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f656703c550..955ccb4c99e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,7 +15,7 @@ jobs:
node-version: "10.x"
- run: npm install --save-dev @commitlint/{config-conventional,cli} commitlint-plugin-jira-rules commitlint-config-jira
- name: Add dependencies for commitlint action
- run: echo "::set-env name=NODE_PATH::$GITHUB_WORKSPACE/node_modules"
+ run: echo "NODE_PATH=$GITHUB_WORKSPACE/node_modules" >> $GITHUB_ENV
- run: git remote add upstream https://github.com/modin-project/modin.git
- run: git fetch upstream
- run: npx commitlint --from upstream/master --to HEAD --verbose
@@ -32,7 +32,7 @@ jobs:
python-version: "3.7.x"
architecture: "x64"
- run: pip install black
- - run: black --check --diff modin/
+ - run: black --check --diff modin/ asv_bench/benchmarks
lint-pydocstyle:
name: lint (pydocstyle)
@@ -51,6 +51,7 @@ jobs:
- run: pydocstyle --convention=numpy --add-ignore=D101,D102 modin/pandas/series_utils.py
- run: pydocstyle --convention=numpy --add-ignore=D103 modin/pandas/general.py
- run: pydocstyle --convention=numpy modin/pandas/plotting.py modin/pandas/utils.py modin/pandas/iterator.py modin/pandas/indexing.py
+ - run: pydocstyle --convention=numpy --add-ignore=D100,D104 modin/engines/base/frame
lint-flake8:
name: lint (flake8)
@@ -64,53 +65,19 @@ jobs:
python-version: "3.7.x"
architecture: "x64"
- run: pip install flake8 flake8-print
- - run: flake8 --enable=T modin
-
- prepare-cache:
- runs-on: ${{ matrix.os }}
- strategy:
- matrix:
- os: ["ubuntu-latest", "windows-latest"]
- python-version: ["3.6", "3.7", "3.8"]
- steps:
- - uses: actions/checkout@v2
- with:
- fetch-depth: 1
- - name: Cache pip if Ubuntu
- if: startsWith(runner.os, 'Linux')
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - name: Cache pip if Windows
- if: startsWith(runner.os, 'Windows')
- uses: actions/cache@v1
- with:
- path: ~\AppData\Local\pip\Cache
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: actions/setup-python@v2
- with:
- python-version: ${{matrix.python-version}}
- architecture: "x64"
- - run: pip install "ray>=1.0.0"
+ - run: flake8 --enable=T modin/ asv_bench/benchmarks
test-api:
- needs: prepare-cache
runs-on: ubuntu-latest
name: test api
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: 3.7
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -129,23 +96,17 @@ jobs:
run: python -m pytest modin/test/test_backends_api.py
test-headers:
- needs: prepare-cache
runs-on: ubuntu-latest
name: test-headers
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
- python-version: 3.6
+ environment-file: environment-dev.yml
+ python-version: 3.7
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
- name: Conda environment
@@ -157,6 +118,44 @@ jobs:
shell: bash -l {0}
run: python -m pytest modin/test/test_headers.py
+ test-clean-install-ubuntu:
+ needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
+ runs-on: ubuntu-latest
+ name: test-clean-install-ubuntu
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 1
+ - uses: actions/setup-python@v2
+ with:
+ python-version: "3.7.x"
+ architecture: "x64"
+ - name: Clean install and run
+ shell: bash -l {0}
+ run: |
+ python -m pip install -e .[all]
+ MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
+ MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
+
+ test-clean-install-windows:
+ needs: [ lint-commit, lint-flake8, lint-black, test-api, test-headers ]
+ runs-on: windows-latest
+ name: test-clean-install-windows
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 1
+ - uses: actions/setup-python@v2
+ with:
+ python-version: "3.7.x"
+ architecture: "x64"
+ - name: Clean install and run
+ shell: bash -l {0}
+ run: |
+ python -m pip install -e .[all]
+ MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
+ MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
+
test-internals:
needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
runs-on: ubuntu-latest
@@ -165,16 +164,11 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
- python-version: 3.6
+ environment-file: environment-dev.yml
+ python-version: 3.7
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
- name: Conda environment
@@ -189,6 +183,14 @@ jobs:
run: python -m pytest modin/config/test
- shell: bash -l {0}
run: python -m pytest modin/test/test_envvar_catcher.py
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/backends/base/test_internals.py
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/backends/pandas/test_internals.py
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/test_envvar_npartitions.py
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/test_partition_api.py
test-defaults:
needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
@@ -199,21 +201,16 @@ jobs:
env:
MODIN_MEMORY: 1000000000
MODIN_TEST_DATASET_SIZE: "small"
- name: Test ${{ matrix.backend }} backend, Python 3.6
+ name: Test ${{ matrix.backend }} backend, Python 3.7
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
- python-version: 3.6
+ environment-file: environment-dev.yml
+ python-version: 3.7
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
- name: Conda environment
@@ -223,6 +220,10 @@ jobs:
conda list
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
+ - shell: bash -l {0}
+ run: pytest modin/experimental/xgboost/test/test_default.py --backend=${{ matrix.backend }}
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/backends/base/test_internals.py --backend=${{ matrix.backend }}
- shell: bash -l {0}
run: pytest modin/pandas/test/dataframe/test_binary.py --backend=${{ matrix.backend }}
- shell: bash -l {0}
@@ -269,13 +270,8 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- name: Setting up Modin environment
- uses: goanpeca/setup-miniconda@v1.6.0
+ uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin_on_omnisci
python-version: 3.7.8
@@ -293,13 +289,64 @@ jobs:
run: pytest modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py
- shell: bash -l {0}
run: bash <(curl -s https://codecov.io/bash)
-
+
+ test-asv-benchmarks:
+ needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
+ runs-on: ubuntu-latest
+ env:
+ MODIN_ENGINE: ray
+ MODIN_MEMORY: 1000000000
+ MODIN_TEST_DATASET_SIZE: small
+ name: test-asv-benchmarks
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 1
+ - uses: conda-incubator/setup-miniconda@v2
+ with:
+ activate-environment: modin
+ environment-file: environment-dev.yml
+ python-version: 3.7
+ channel-priority: strict
+ use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+ - name: Conda environment
+ shell: bash -l {0}
+ run: |
+ conda info
+ conda list
+
+ - name: Running benchmarks
+ shell: bash -l {0}
+ run: |
+ pip install -e .
+ cd asv_bench
+ asv check -E existing
+ git remote add upstream https://github.com/modin-project/modin.git
+ git fetch upstream
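+          # run the benchmarks only when the PR changes something under asv_bench/;
+          # failing benchmark lines are prefixed with ##[error] and make the job exit non-zero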
+ if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
+ asv machine --yes
+ asv run --quick --show-stderr --python=same --launch-method=spawn | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log
+ if grep "failed" benchmarks.log > /dev/null ; then
+ exit 1
+ fi
+ else
+ echo "Benchmarks did not run, no changes detected"
+ fi
+ if: always()
+
+ - name: Publish benchmarks artifact
+ uses: actions/upload-artifact@master
+ with:
+ name: Benchmarks log
+ path: asv_bench/benchmarks.log
+ if: failure()
+
test-all:
needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers]
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.6", "3.7", "3.8"]
+ python-version: ["3.7", "3.8"]
engine: ["python", "ray", "dask"]
env:
MODIN_ENGINE: ${{matrix.engine}}
@@ -309,15 +356,10 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -328,6 +370,8 @@ jobs:
conda list
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
+ - shell: bash -l {0}
+ run: pytest modin/experimental/xgboost/test/test_default.py
- shell: bash -l {0}
run: pytest modin/pandas/test/dataframe/test_binary.py
- shell: bash -l {0}
@@ -377,15 +421,10 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: 3.7
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -415,15 +454,10 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: 3.7
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -433,7 +467,7 @@ jobs:
conda info
conda list
- shell: bash -l {0}
- run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::test_from_csv
+ run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::TestCsv
- shell: bash -l {0}
run: bash <(curl -s https://codecov.io/bash)
@@ -442,7 +476,7 @@ jobs:
runs-on: windows-latest
strategy:
matrix:
- python-version: ["3.6", "3.7", "3.8"]
+ python-version: ["3.7", "3.8"]
engine: ["ray", "dask"]
part: ["DataFrame", 3]
env:
@@ -453,15 +487,10 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~\AppData\Local\pip\Cache
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -529,7 +558,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.6", "3.7", "3.8"]
+ python-version: ["3.7", "3.8"]
env:
MODIN_BACKEND: pyarrow
MODIN_EXPERIMENTAL: "True"
@@ -538,15 +567,10 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -557,4 +581,4 @@ jobs:
conda list
- run: sudo apt update && sudo apt install -y libhdf5-dev
- shell: bash -l {0}
- run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
+ run: python -m pytest modin/pandas/test/test_io.py::TestCsv
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index ff8677e4aa9..337a0a467dd 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -1,36 +1,40 @@
name: master
on: push
jobs:
- prepare-cache:
- runs-on: ${{ matrix.os }}
- strategy:
- matrix:
- os: ["ubuntu-latest", "windows-latest"]
- python-version: ["3.6", "3.7", "3.8"]
+ test-internals:
+ runs-on: ubuntu-latest
+ name: test-internals
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip if Ubuntu
- if: startsWith(runner.os, 'Linux')
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - name: Cache pip if Windows
- if: startsWith(runner.os, 'Windows')
- uses: actions/cache@v1
+ - uses: conda-incubator/setup-miniconda@v2
with:
- path: ~\AppData\Local\pip\Cache
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: actions/setup-python@v2
- with:
- python-version: ${{matrix.python-version}}
- architecture: "x64"
- - run: pip install "ray>=1.0.0"
+ activate-environment: modin
+ environment-file: environment-dev.yml
+ python-version: 3.7
+ channel-priority: strict
+ use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+ - name: Conda environment
+ shell: bash -l {0}
+ run: |
+ conda info
+ conda list
+ - name: Internals tests
+ shell: bash -l {0}
+ run: python -m pytest modin/data_management/factories/test/test_dispatcher.py modin/experimental/cloud/test/test_cloud.py
+ - shell: bash -l {0}
+ run: python -m pytest modin/config/test
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/test_envvar_catcher.py
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/backends/pandas/test_internals.py
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/test_envvar_npartitions.py
+ - shell: bash -l {0}
+ run: python -m pytest modin/test/test_partition_api.py
test-defaults:
- needs: prepare-cache
runs-on: ubuntu-latest
strategy:
matrix:
@@ -38,21 +42,16 @@ jobs:
env:
MODIN_MEMORY: 1000000000
MODIN_TEST_DATASET_SIZE: "small"
- name: Test ${{ matrix.backend }} backend, Python 3.6
+ name: Test ${{ matrix.backend }} backend, Python 3.7
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
- python-version: 3.6
+ environment-file: environment-dev.yml
+ python-version: 3.7
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
- name: Conda environment
@@ -62,6 +61,8 @@ jobs:
conda list
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
+ - shell: bash -l {0}
+ run: pytest modin/experimental/xgboost/test/test_default.py --backend=${{ matrix.backend }}
- shell: bash -l {0}
run: pytest modin/pandas/test/dataframe/test_binary.py --backend=${{ matrix.backend }}
- shell: bash -l {0}
@@ -96,7 +97,6 @@ jobs:
run: bash <(curl -s https://codecov.io/bash)
test-omnisci:
- needs: prepare-cache
runs-on: ubuntu-latest
env:
MODIN_MEMORY: 1000000000
@@ -108,13 +108,8 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- name: Setting up Modin environment
- uses: goanpeca/setup-miniconda@v1.6.0
+ uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin_on_omnisci
python-version: 3.7.8
@@ -134,11 +129,10 @@ jobs:
run: bash <(curl -s https://codecov.io/bash)
test-all:
- needs: prepare-cache
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.6", "3.7", "3.8"]
+ python-version: ["3.7", "3.8"]
engine: ["python", "ray", "dask"]
env:
MODIN_ENGINE: ${{matrix.engine}}
@@ -148,15 +142,10 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -167,6 +156,8 @@ jobs:
conda list
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
+ - shell: bash -l {0}
+ run: pytest modin/experimental/xgboost/test/test_default.py
- shell: bash -l {0}
run: pytest modin/pandas/test/dataframe/test_binary.py
- shell: bash -l {0}
@@ -205,11 +196,10 @@ jobs:
run: bash <(curl -s https://codecov.io/bash)
test-windows:
- needs: prepare-cache
runs-on: windows-latest
strategy:
matrix:
- python-version: ["3.6", "3.7", "3.8"]
+ python-version: ["3.7", "3.8"]
engine: ["ray", "dask"]
part: ["DataFrame", 3]
env:
@@ -220,15 +210,10 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~\AppData\Local\pip\Cache
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -292,11 +277,10 @@ jobs:
run: codecov -f ./coverage.xml
test-pyarrow:
- needs: prepare-cache
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.6", "3.7", "3.8"]
+ python-version: ["3.7", "3.8"]
env:
MODIN_BACKEND: pyarrow
MODIN_EXPERIMENTAL: "True"
@@ -305,15 +289,10 @@ jobs:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- - name: Cache pip
- uses: actions/cache@v1
- with:
- path: ~/.cache/pip
- key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }}
- - uses: goanpeca/setup-miniconda@v1.6.0
+ - uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: modin
- environment-file: environment.yml
+ environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
@@ -324,4 +303,4 @@ jobs:
conda list
- run: sudo apt update && sudo apt install -y libhdf5-dev
- shell: bash -l {0}
- run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
+ run: python -m pytest modin/pandas/test/test_io.py::TestCsv
diff --git a/.gitignore b/.gitignore
index 678945c2776..4fd96ed4c0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,3 +172,7 @@ cscope.out
# Dask workspace
dask-worker-space/
node_modules
+
+# ASV benchmark artifacts
+asv_bench/.asv/
+asv_bench/modin/
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 00000000000..5f08686586a
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,12 @@
+# These owners will be the default owners for everything in
+# the repo unless a later match takes precedence.
+* @modin-project/modin-core
+
+# These owners will review everything in the Omnisci engine component
+# of Modin.
+/modin/experimental/backends/omnisci/** @modin-project/modin-omnisci
+/modin/experimental/engines/omnisci_on_ray/** @modin-project/modin-omnisci
+
+# These owners will review everything related to the xgboost implementation
+# in Modin.
+/modin/experimental/xgboost/** @modin-project/modin-xgboost
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000000..48b8c759233
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at conduct@gr-oss.io. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/README.md b/README.md
index aa2a1add823..b98b9d6e99a 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
-
+
To use Modin, replace the pandas import:
@@ -131,6 +131,8 @@ import numpy as np
frame_data = np.random.randint(0, 100, size=(2**10, 2**8))
df = pd.DataFrame(frame_data)
```
+**When running locally (without a cluster), Modin will create and manage a local (Dask or Ray) cluster for execution.**
+
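+For example (illustrative, not required), the engine can be pinned explicitly via the
+`MODIN_ENGINE` environment variable before Modin is imported:
+
+```python
+import os
+
+os.environ["MODIN_ENGINE"] = "dask"  # or "ray"; if unset, Modin picks an engine automatically
+import modin.pandas as pd
+```
+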
To use Modin, you do not need to know how many cores your system has and you do not need
to specify how to distribute the data. In fact, you can continue using your previous
@@ -179,8 +181,8 @@ and improve:

-Visit the [Documentation](https://modin.readthedocs.io/en/latest/architecture.html) for
-more information!
+Visit the [Documentation](https://modin.readthedocs.io/en/latest/developer/architecture.html) for
+more information, and check out [the difference between Modin and Dask](https://github.com/modin-project/modin/tree/master/docs/modin_vs_dask.md)!
**`modin.pandas` is currently under active development. Requests and contributions are welcome!**
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
new file mode 100644
index 00000000000..fc7a3d99525
--- /dev/null
+++ b/asv_bench/asv.conf.json
@@ -0,0 +1,157 @@
+{
+ // The version of the config file format. Do not change, unless
+ // you know what you are doing.
+ "version": 1,
+
+ // The name of the project being benchmarked
+ "project": "modin",
+
+ // The project's homepage
+ "project_url": "https://modin.readthedocs.io/",
+
+ // The URL or local path of the source code repository for the
+ // project being benchmarked
+ "repo": "..",
+
+ // The Python project's subdirectory in your repo. If missing or
+ // the empty string, the project is assumed to be located at the root
+ // of the repository.
+ // "repo_subdir": "",
+
+ // Customizable commands for building, installing, and
+ // uninstalling the project. See asv.conf.json documentation.
+ //
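+    // (the built wheel is installed together with its "ray" extra so that the Ray
+    // engine is available inside the benchmark environments)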
+ "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}[ray]"],
+ // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
+ // "build_command": [
+ // "python setup.py build",
+ // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
+ // ],
+
+ // List of branches to benchmark. If not provided, defaults to "master"
+ // (for git) or "default" (for mercurial).
+ // "branches": ["master"], // for git
+ // "branches": ["default"], // for mercurial
+
+ // The DVCS being used. If not set, it will be automatically
+ // determined from "repo" by looking at the protocol in the URL
+ // (if remote), or by looking for special directories, such as
+ // ".git" (if local).
+ // "dvcs": "git",
+
+ // The tool to use to create environments. May be "conda",
+ // "virtualenv" or other value depending on the plugins in use.
+ // If missing or the empty string, the tool will be automatically
+ // determined by looking for tools on the PATH environment
+ // variable.
+ "environment_type": "conda",
+
+ // timeout in seconds for installing any dependencies in environment
+ // defaults to 10 min
+ //"install_timeout": 600,
+
+ // the base URL to show a commit for the project.
+ "show_commit_url": "https://github.com/modin-project/modin/commit/",
+
+ // The Pythons you'd like to test against. If not provided, defaults
+ // to the current version of Python used to run `asv`.
+ // "pythons": ["3.7"],
+
+ // The list of conda channel names to be searched for benchmark
+ // dependency packages in the specified order
+ "conda_channels": ["conda-forge", "defaults"],
+
+ // The matrix of dependencies to test. Each key is the name of a
+ // package (in PyPI) and the values are version numbers. An empty
+ // list or empty string indicates to just test against the default
+ // (latest) version. null indicates that the package is to not be
+ // installed. If the package to be tested is only available from
+ // PyPi, and the 'environment_type' is conda, then you can preface
+ // the package name by 'pip+', and the package will be installed via
+ // pip (with all the conda available packages installed first,
+ // followed by the pip installed packages).
+ // "matrix": {
+ // "pip+ray": ["1.0.1"],
+ // "pyarrow": ["1.0"]
+ // },
+ // Combinations of libraries/python versions can be excluded/included
+ // from the set to test. Each entry is a dictionary containing additional
+ // key-value pairs to include/exclude.
+ //
+ // An exclude entry excludes entries where all values match. The
+ // values are regexps that should match the whole string.
+ //
+ // An include entry adds an environment. Only the packages listed
+ // are installed. The 'python' key is required. The exclude rules
+ // do not apply to includes.
+ //
+ // In addition to package names, the following keys are available:
+ //
+ // - python
+ // Python version, as in the *pythons* variable above.
+ // - environment_type
+ // Environment type, as above.
+ // - sys_platform
+ // Platform, as in sys.platform. Possible values for the common
+ // cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+ //
+ // "exclude": [
+ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+ // {"environment_type": "conda", "six": null}, // don't run without six on conda
+ // ],
+ //
+ // "include": [
+ // // additional env for python2.7
+ // {"python": "2.7", "numpy": "1.8"},
+ // // additional env if run on windows+conda
+ // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
+ // ],
+
+ // The directory (relative to the current directory) that benchmarks are
+ // stored in. If not provided, defaults to "benchmarks"
+ // "benchmark_dir": "benchmarks",
+
+ // The directory (relative to the current directory) to cache the Python
+ // environments in. If not provided, defaults to "env"
+ "env_dir": ".asv/env",
+
+ // The directory (relative to the current directory) that raw benchmark
+ // results are stored in. If not provided, defaults to "results".
+ "results_dir": ".asv/results",
+
+ // The directory (relative to the current directory) that the html tree
+ // should be written to. If not provided, defaults to "html".
+ "html_dir": ".asv/html",
+
+ // The number of characters to retain in the commit hashes.
+ // "hash_length": 8,
+
+ // `asv` will cache results of the recent builds in each
+ // environment, making them faster to install next time. This is
+ // the number of builds to keep, per environment.
+ // "build_cache_size": 2,
+
+ // The commits after which the regression search in `asv publish`
+ // should start looking for regressions. Dictionary whose keys are
+ // regexps matching to benchmark names, and values corresponding to
+ // the commit (exclusive) after which to start looking for
+ // regressions. The default is to start from the first commit
+ // with results. If the commit is `null`, regression detection is
+ // skipped for the matching benchmark.
+ //
+ // "regressions_first_commits": {
+ // "some_benchmark": "352cdf", // Consider regressions only after this commit
+ // "another_benchmark": null, // Skip regression detection altogether
+ // },
+
+ // The thresholds for relative change in results, after which `asv
+ // publish` starts reporting regressions. Dictionary of the same
+ // form as in ``regressions_first_commits``, with values
+ // indicating the thresholds. If multiple entries match, the
+ // maximum is taken. If no entry matches, the default is 5%.
+ //
+ // "regressions_thresholds": {
+ // "some_benchmark": 0.01, // Threshold of 1%
+ // "another_benchmark": 0.5, // Threshold of 50%
+ // },
+}
diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py
new file mode 100644
index 00000000000..db841befebe
--- /dev/null
+++ b/asv_bench/benchmarks/__init__.py
@@ -0,0 +1,14 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+"""Modin benchmarks"""
diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py
new file mode 100644
index 00000000000..32d3dfc2913
--- /dev/null
+++ b/asv_bench/benchmarks/benchmarks.py
@@ -0,0 +1,559 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+# define the `MODIN_CPUS` env var to control the number of partitions;
+# if set via os.environ, it must be set before importing modin.pandas
+
+# define the `MODIN_ASV_USE_IMPL` env var to choose the library used in the
+# performance measurements
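+#
+# e.g. a hypothetical invocation that sets both variables (each is optional):
+#   MODIN_CPUS=16 MODIN_ASV_USE_IMPL=pandas asv run --quick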
+
+import modin.pandas as pd
+import numpy as np
+
+from .utils import (
+ generate_dataframe,
+ RAND_LOW,
+ RAND_HIGH,
+ random_string,
+ random_columns,
+ random_booleans,
+ ASV_USE_IMPL,
+ ASV_DATASET_SIZE,
+ BINARY_OP_DATA_SIZE,
+ UNARY_OP_DATA_SIZE,
+ GROUPBY_NGROUPS,
+ IMPL,
+ execute,
+)
+
+
+class BaseTimeGroupBy:
+ def setup(self, shape, ngroups=5, groupby_ncols=1):
+ if callable(ngroups):
+ ngroups = ngroups(shape[0])
+ self.df, self.groupby_columns = generate_dataframe(
+ ASV_USE_IMPL,
+ "int",
+ *shape,
+ RAND_LOW,
+ RAND_HIGH,
+ groupby_ncols,
+ count_groups=ngroups,
+ )
+
+
+class TimeGroupByMultiColumn(BaseTimeGroupBy):
+ param_names = ["shape", "ngroups", "groupby_ncols"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+ [6],
+ ]
+
+ def time_groupby_agg_quan(self, *args, **kwargs):
+ execute(self.df.groupby(by=self.groupby_columns).agg("quantile"))
+
+ def time_groupby_agg_mean(self, *args, **kwargs):
+ execute(self.df.groupby(by=self.groupby_columns).apply(lambda df: df.mean()))
+
+
+class TimeGroupByDefaultAggregations(BaseTimeGroupBy):
+ param_names = ["shape", "ngroups"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+ ]
+
+ def time_groupby_count(self, *args, **kwargs):
+ execute(self.df.groupby(by=self.groupby_columns).count())
+
+ def time_groupby_size(self, *args, **kwargs):
+ execute(self.df.groupby(by=self.groupby_columns).size())
+
+ def time_groupby_sum(self, *args, **kwargs):
+ execute(self.df.groupby(by=self.groupby_columns).sum())
+
+ def time_groupby_mean(self, *args, **kwargs):
+ execute(self.df.groupby(by=self.groupby_columns).mean())
+
+
+class TimeGroupByDictionaryAggregation(BaseTimeGroupBy):
+ param_names = ["shape", "ngroups", "operation_type"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+ ["reduction", "aggregation"],
+ ]
+ operations = {
+ "reduction": ["sum", "count", "prod"],
+ "aggregation": ["quantile", "std", "median"],
+ }
+
+ def setup(self, shape, ngroups, operation_type):
+ super().setup(shape, ngroups)
+ self.cols_to_agg = self.df.columns[1:4]
+ operations = self.operations[operation_type]
+ self.agg_dict = {
+ c: operations[i % len(operations)] for i, c in enumerate(self.cols_to_agg)
+ }
+
+ def time_groupby_dict_agg(self, *args, **kwargs):
+ execute(self.df.groupby(by=self.groupby_columns).agg(self.agg_dict))
+
+
+class TimeJoin:
+ param_names = ["shapes", "how", "sort"]
+ params = [
+ BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ ["left", "inner"],
+ [False],
+ ]
+
+ def setup(self, shapes, how, sort):
+ self.df1 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
+ )
+ self.df2 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
+ )
+
+ def time_join(self, shapes, how, sort):
+        # join dataframes on index to get a predictable result shape
+ execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort))
+
+
+class TimeMerge:
+ param_names = ["shapes", "how", "sort"]
+ params = [
+ BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ ["left", "inner"],
+ [False],
+ ]
+
+ def setup(self, shapes, how, sort):
+ self.df1 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
+ )
+ self.df2 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
+ )
+
+ def time_merge(self, shapes, how, sort):
+        # merge dataframes on index to get a predictable result shape
+ execute(
+ self.df1.merge(
+ self.df2, left_index=True, right_index=True, how=how, sort=sort
+ )
+ )
+
+
+class TimeConcat:
+ param_names = ["shapes", "how", "axis"]
+ params = [
+ BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ ["inner"],
+ [0, 1],
+ ]
+
+ def setup(self, shapes, how, axis):
+ self.df1 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
+ )
+ self.df2 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
+ )
+
+ def time_concat(self, shapes, how, axis):
+ execute(IMPL[ASV_USE_IMPL].concat([self.df1, self.df2], axis=axis, join=how))
+
+
+class TimeAppend:
+ param_names = ["shapes", "sort"]
+ params = [
+ BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [False, True],
+ ]
+
+ def setup(self, shapes, sort):
+ self.df1 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
+ )
+ self.df2 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
+ )
+ if sort:
+ self.df1.columns = self.df1.columns[::-1]
+
+ def time_append(self, shapes, sort):
+ execute(self.df1.append(self.df2, sort=sort))
+
+
+class TimeBinaryOp:
+ param_names = ["shapes", "binary_op", "axis"]
+ params = [
+ BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ ["mul"],
+ [0, 1],
+ ]
+
+ def setup(self, shapes, binary_op, axis):
+ self.df1 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
+ )
+ self.df2 = generate_dataframe(
+ ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
+ )
+ self.op = getattr(self.df1, binary_op)
+
+ def time_binary_op(self, shapes, binary_op, axis):
+ execute(self.op(self.df2, axis=axis))
+
+
+class BaseTimeSetItem:
+ param_names = ["shape", "item_length", "loc", "is_equal_indices"]
+
+ @staticmethod
+ def get_loc(df, loc, axis, item_length):
+ locs_dict = {
+ "zero": 0,
+ "middle": len(df.axes[axis]) // 2,
+ "last": len(df.axes[axis]) - 1,
+ }
+ base_loc = locs_dict[loc]
+ range_based_loc = np.arange(
+ base_loc, min(len(df.axes[axis]), base_loc + item_length)
+ )
+ return (
+ (df.axes[axis][base_loc], base_loc)
+ if len(range_based_loc) == 1
+ else (df.axes[axis][range_based_loc], range_based_loc)
+ )
+
+ def setup(self, shape, item_length, loc, is_equal_indices):
+ self.df = generate_dataframe(
+ ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH
+ ).copy()
+ self.loc, self.iloc = self.get_loc(
+ self.df, loc, item_length=item_length, axis=1
+ )
+
+ self.item = self.df[self.loc] + 1
+ self.item_raw = self.item.to_numpy()
+ if not is_equal_indices:
+ self.item.index = reversed(self.item.index)
+
+
+class TimeSetItem(BaseTimeSetItem):
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [1],
+ ["zero", "middle", "last"],
+ [True, False],
+ ]
+
+ def time_setitem_qc(self, *args, **kwargs):
+ self.df[self.loc] = self.item
+ execute(self.df)
+
+ def time_setitem_raw(self, *args, **kwargs):
+ self.df[self.loc] = self.item_raw
+ execute(self.df)
+
+
+class TimeInsert(BaseTimeSetItem):
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [1],
+ ["zero", "middle", "last"],
+ [True, False],
+ ]
+
+ def time_insert_qc(self, *args, **kwargs):
+ self.df.insert(loc=self.iloc, column=random_string(), value=self.item)
+ execute(self.df)
+
+ def time_insert_raw(self, *args, **kwargs):
+ self.df.insert(loc=self.iloc, column=random_string(), value=self.item_raw)
+ execute(self.df)
+
+
+class TimeArithmetic:
+ param_names = ["shape", "axis"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [0, 1],
+ ]
+
+ def setup(self, shape, axis):
+ self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+
+ def time_sum(self, shape, axis):
+ execute(self.df.sum(axis=axis))
+
+ def time_median(self, shape, axis):
+ execute(self.df.median(axis=axis))
+
+ def time_nunique(self, shape, axis):
+ execute(self.df.nunique(axis=axis))
+
+ def time_apply(self, shape, axis):
+ execute(self.df.apply(lambda df: df.sum(), axis=axis))
+
+ def time_mean(self, shape, axis):
+ execute(self.df.mean(axis=axis))
+
+
+class TimeSortValues:
+ param_names = ["shape", "columns_number", "ascending_list"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [1, 2, 10, 100],
+ [False, True],
+ ]
+
+ def setup(self, shape, columns_number, ascending_list):
+ self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+ self.columns = random_columns(self.df.columns, columns_number)
+ self.ascending = (
+ random_booleans(columns_number)
+ if ascending_list
+ else bool(random_booleans(1)[0])
+ )
+
+ def time_sort_values(self, shape, columns_number, ascending_list):
+ execute(self.df.sort_values(self.columns, ascending=self.ascending))
+
+
+class TimeDrop:
+ param_names = ["shape", "axis", "drop_ncols"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [0, 1],
+ [1, 0.8],
+ ]
+
+ def setup(self, shape, axis, drop_ncols):
+ self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+ drop_count = (
+ int(len(self.df.axes[axis]) * drop_ncols)
+ if isinstance(drop_ncols, float)
+ else drop_ncols
+ )
+ self.labels = self.df.axes[axis][:drop_count]
+
+ def time_drop(self, shape, axis, drop_ncols):
+ execute(self.df.drop(self.labels, axis))
+
+
+class TimeHead:
+ param_names = ["shape", "head_count"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [5, 0.8],
+ ]
+
+ def setup(self, shape, head_count):
+ self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+ self.head_count = (
+ int(head_count * len(self.df.index))
+ if isinstance(head_count, float)
+ else head_count
+ )
+
+ def time_head(self, shape, head_count):
+ execute(self.df.head(self.head_count))
+
+
+class TimeFillna:
+ param_names = ["shape", "limit", "inplace"]
+ params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], [None, 0.8], [False, True]]
+
+ def setup(self, shape, limit, inplace):
+ pd = IMPL[ASV_USE_IMPL]
+ columns = [f"col{x}" for x in range(shape[1])]
+ self.df = pd.DataFrame(np.nan, index=pd.RangeIndex(shape[0]), columns=columns)
+ self.limit = int(limit * shape[0]) if limit else None
+
+ def time_fillna(self, shape, limit, inplace):
+ kw = {"value": 0.0, "limit": self.limit, "inplace": inplace}
+ if inplace:
+ self.df.fillna(**kw)
+ execute(self.df)
+ else:
+ execute(self.df.fillna(**kw))
+
+
+class BaseTimeValueCounts:
+ subset_params = {
+ "all": lambda shape: shape[1],
+ "half": lambda shape: shape[1] // 2,
+ }
+
+ def setup(self, shape, ngroups=5, subset="all"):
+ try:
+ subset = self.subset_params[subset]
+ except KeyError:
+ raise KeyError(
+ f"Invalid value for 'subset={subset}'. Allowed: {list(self.subset_params.keys())}"
+ )
+ ncols = subset(shape)
+ self.df, _ = generate_dataframe(
+ ASV_USE_IMPL,
+ "int",
+ *shape,
+ RAND_LOW,
+ RAND_HIGH,
+ groupby_ncols=ncols,
+ count_groups=ngroups,
+ )
+ self.subset = self.df.columns[:ncols].tolist()
+
+
+class TimeValueCountsFrame(BaseTimeValueCounts):
+ param_names = ["shape", "ngroups", "subset"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+ ["all", "half"],
+ ]
+
+ def time_value_counts(self, *args, **kwargs):
+ execute(self.df.value_counts(subset=self.subset))
+
+
+class TimeValueCountsSeries(BaseTimeValueCounts):
+ param_names = ["shape", "ngroups", "bins"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ GROUPBY_NGROUPS[ASV_DATASET_SIZE],
+ [None, 3],
+ ]
+
+ def setup(self, shape, ngroups, bins):
+ super().setup(ngroups=ngroups, shape=shape)
+ self.df = self.df.iloc[:, 0]
+
+ def time_value_counts(self, shape, ngroups, bins):
+ execute(self.df.value_counts(bins=bins))
+
+
+class TimeIndexing:
+ param_names = ["shape", "indexer_type"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [
+ "scalar",
+ "bool",
+ "slice",
+ "list",
+ "function",
+ ],
+ ]
+
+ def setup(self, shape, indexer_type):
+ self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+ if indexer_type == "bool":
+ self.indexer = [False, True] * (shape[0] // 2)
+ elif indexer_type == "scalar":
+ self.indexer = shape[0] // 2
+ elif indexer_type == "slice":
+ self.indexer = slice(0, shape[0], 2)
+ elif indexer_type == "list":
+ self.indexer = [x for x in range(shape[0])]
+ elif indexer_type == "function":
+ self.indexer = lambda df: df.index[::-2]
+
+ def time_iloc(self, shape, indexer_type):
+ execute(self.df.iloc[self.indexer])
+
+ def time_loc(self, shape, indexer_type):
+ execute(self.df.loc[self.indexer])
+
+
+class TimeMultiIndexing:
+ param_names = ["shape"]
+ params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE]]
+
+ def setup(self, shape):
+ df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+
+ index = pd.MultiIndex.from_product([df.index[: shape[0] // 2], ["bar", "foo"]])
+ columns = pd.MultiIndex.from_product(
+ [df.columns[: shape[1] // 2], ["buz", "fuz"]]
+ )
+
+ df.index = index
+ df.columns = columns
+
+ self.df = df.sort_index(axis=1)
+
+ def time_multiindex_loc(self, shape):
+ execute(
+ self.df.loc[
+ self.df.index[2] : self.df.index[-2],
+ self.df.columns[2] : self.df.columns[-2],
+ ]
+ )
+
+
+class TimeAstype:
+ param_names = ["shape", "dtype", "astype_ncolumns"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ ["float64", "category"],
+ ["one", "all"],
+ ]
+
+ def setup(self, shape, dtype, astype_ncolumns):
+ self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+ if astype_ncolumns == "all":
+ self.astype_arg = dtype
+ elif astype_ncolumns == "one":
+ self.astype_arg = {"col1": dtype}
+ else:
+            raise ValueError(f"astype_ncolumns: {astype_ncolumns} isn't supported")
+
+ def time_astype(self, shape, dtype, astype_ncolumns):
+ execute(self.df.astype(self.astype_arg))
+
+
+class TimeDescribe:
+ param_names = ["shape"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ ]
+
+ def setup(self, shape):
+ self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+
+ def time_describe(self, shape):
+ execute(self.df.describe())
+
+
+class TimeProperties:
+ param_names = ["shape"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ ]
+
+ def setup(self, shape):
+ self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+
+ def time_shape(self, shape):
+ return self.df.shape
+
+ def time_columns(self, shape):
+ return self.df.columns
+
+ def time_index(self, shape):
+ return self.df.index
diff --git a/asv_bench/benchmarks/io/__init__.py b/asv_bench/benchmarks/io/__init__.py
new file mode 100644
index 00000000000..cae6413e559
--- /dev/null
+++ b/asv_bench/benchmarks/io/__init__.py
@@ -0,0 +1,12 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
new file mode 100644
index 00000000000..e2bcde46a6b
--- /dev/null
+++ b/asv_bench/benchmarks/io/csv.py
@@ -0,0 +1,67 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import modin.pandas as pd
+import numpy as np
+
+from ..utils import (
+ generate_dataframe,
+ RAND_LOW,
+ RAND_HIGH,
+ ASV_USE_IMPL,
+ ASV_DATASET_SIZE,
+ UNARY_OP_DATA_SIZE,
+ IMPL,
+ execute,
+ get_shape_id,
+)
+
+# ray init
+if ASV_USE_IMPL == "modin":
+ pd.DataFrame([])
+
+
+class BaseReadCsv:
+    # test data file can be created only once
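+    # (asv runs setup_cache only once, caches its result, and passes the return
+    # value as the first argument to setup and the time_* methods)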
+ def setup_cache(self, test_filename="io_test_file"):
+ test_filenames = {}
+ for shape in UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE]:
+ shape_id = get_shape_id(shape)
+ test_filenames[shape_id] = f"{test_filename}_{shape_id}.csv"
+ df = generate_dataframe("pandas", "str_int", *shape, RAND_LOW, RAND_HIGH)
+ df.to_csv(test_filenames[shape_id], index=False)
+
+ return test_filenames
+
+ def setup(self, test_filenames, shape, *args, **kwargs):
+ self.shape_id = get_shape_id(shape)
+
+
+class TimeReadCsvSkiprows(BaseReadCsv):
+ param_names = ["shape", "skiprows"]
+ params = [
+ UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
+ [
+ None,
+ lambda x: x % 2,
+ np.arange(1, UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE][0][0] // 10),
+ np.arange(1, UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE][0][0], 2),
+ ],
+ ]
+
+ def time_skiprows(self, test_filenames, shape, skiprows):
+ execute(
+ IMPL[ASV_USE_IMPL].read_csv(
+ test_filenames[self.shape_id], skiprows=skiprows
+ )
+ )
diff --git a/asv_bench/benchmarks/utils.py b/asv_bench/benchmarks/utils.py
new file mode 100644
index 00000000000..0e9a224e933
--- /dev/null
+++ b/asv_bench/benchmarks/utils.py
@@ -0,0 +1,215 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import os
+import logging
+import modin.pandas as pd
+import pandas
+import numpy as np
+import uuid
+
+RAND_LOW = 0
+RAND_HIGH = 100
+random_state = np.random.RandomState(seed=42)
+
+
+try:
+ from modin.config import NPartitions
+
+ NPARTITIONS = NPartitions.get()
+except ImportError:
+ NPARTITIONS = pd.DEFAULT_NPARTITIONS
+
+try:
+ from modin.config import TestDatasetSize, AsvImplementation
+
+ ASV_USE_IMPL = AsvImplementation.get()
+ ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
+except ImportError:
+    # The same benchmarking code can be run against different versions of Modin,
+    # so if these config entries cannot be imported we fall back to environment variables
+ ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
+ ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")
+
+assert ASV_USE_IMPL in ("modin", "pandas")
+
+BINARY_OP_DATA_SIZE = {
+ "Big": [
+ ((5000, 5000), (5000, 5000)),
+ # the case extremely inefficient
+ # ((20, 500_000), (10, 1_000_000)),
+ ((500_000, 20), (1_000_000, 10)),
+ ],
+ "Small": [
+ ((250, 250), (250, 250)),
+ ((20, 10_000), (10, 25_000)),
+ ((10_000, 20), (25_000, 10)),
+ ],
+}
+
+UNARY_OP_DATA_SIZE = {
+ "Big": [
+ (5000, 5000),
+        # this case is extremely inefficient
+ # (10, 1_000_000),
+ (1_000_000, 10),
+ ],
+ "Small": [
+ (250, 250),
+ (10, 10_000),
+ (10_000, 10),
+ ],
+}
+
+GROUPBY_NGROUPS = {
+ "Big": [100, lambda nrows: min(nrows // 2, 5000)],
+ "Small": [5],
+}
+
+IMPL = {
+ "modin": pd,
+ "pandas": pandas,
+}
+
+
+class weakdict(dict):
+ __slots__ = ("__weakref__",)
+
+
+data_cache = dict()
+dataframes_cache = dict()
+
+
+def gen_int_data(nrows, ncols, rand_low, rand_high):
+ cache_key = ("int", nrows, ncols, rand_low, rand_high)
+ if cache_key in data_cache:
+ return data_cache[cache_key]
+
+ logging.info(
+ "Generating int data {} rows and {} columns [{}-{}]".format(
+ nrows, ncols, rand_low, rand_high
+ )
+ )
+ data = {
+ "col{}".format(i): random_state.randint(rand_low, rand_high, size=(nrows))
+ for i in range(ncols)
+ }
+ data_cache[cache_key] = weakdict(data)
+ return data
+
+
+def gen_str_int_data(nrows, ncols, rand_low, rand_high):
+ cache_key = ("str_int", nrows, ncols, rand_low, rand_high)
+ if cache_key in data_cache:
+ return data_cache[cache_key]
+
+ logging.info(
+ "Generating str_int data {} rows and {} columns [{}-{}]".format(
+ nrows, ncols, rand_low, rand_high
+ )
+ )
+ data = gen_int_data(nrows, ncols, rand_low, rand_high).copy()
+ data["gb_col"] = [
+ "str_{}".format(random_state.randint(rand_low, rand_high)) for i in range(nrows)
+ ]
+ data_cache[cache_key] = weakdict(data)
+ return data
+
+
+def gen_data(data_type, nrows, ncols, rand_low, rand_high):
+ if data_type == "int":
+ return gen_int_data(nrows, ncols, rand_low, rand_high)
+ elif data_type == "str_int":
+ return gen_str_int_data(nrows, ncols, rand_low, rand_high)
+ else:
+ assert False
+
+
+def generate_dataframe(
+ impl,
+ data_type,
+ nrows,
+ ncols,
+ rand_low,
+ rand_high,
+ groupby_ncols=None,
+ count_groups=None,
+):
+ assert not (
+ (groupby_ncols is None) ^ (count_groups is None)
+ ), "You must either specify both parameters 'groupby_ncols' and 'count_groups' or none of them."
+
+ if groupby_ncols and count_groups:
+ ncols -= groupby_ncols
+ cache_key = (
+ impl,
+ data_type,
+ nrows,
+ ncols,
+ rand_low,
+ rand_high,
+ groupby_ncols,
+ count_groups,
+ )
+ else:
+ cache_key = (impl, data_type, nrows, ncols, rand_low, rand_high)
+
+ if cache_key in dataframes_cache:
+ return dataframes_cache[cache_key]
+
+ logging.info(
+ "Allocating {} DataFrame {}: {} rows and {} columns [{}-{}]".format(
+ impl, data_type, nrows, ncols, rand_low, rand_high
+ )
+ )
+ data = gen_data(data_type, nrows, ncols, rand_low, rand_high)
+
+ if groupby_ncols and count_groups:
+ groupby_columns = [f"groupby_col{x}" for x in range(groupby_ncols)]
+ for groupby_col in groupby_columns:
+ data[groupby_col] = np.tile(np.arange(count_groups), nrows // count_groups)
+
+ if impl == "modin":
+ df = pd.DataFrame(data)
+ elif impl == "pandas":
+ df = pandas.DataFrame(data)
+ else:
+ assert False
+
+ if groupby_ncols and count_groups:
+ dataframes_cache[cache_key] = df, groupby_columns
+ return df, groupby_columns
+
+ dataframes_cache[cache_key] = df
+ return df
+
+
+def random_string():
+ return str(uuid.uuid1())
+
+
+def random_columns(df_columns, columns_number):
+ return list(random_state.choice(df_columns, size=columns_number))
+
+
+def random_booleans(number):
+ return list(random_state.choice([True, False], size=number))
+
+
+def execute(df):
+ "Make sure the calculations are done."
+ return df.shape, df.dtypes
+
+
+def get_shape_id(array):
+ return "_".join([str(element) for element in array])
diff --git a/ci/jenkins/build-tests/Dockerfile b/ci/jenkins/build-tests/Dockerfile
index 37acb5aabac..51105ae5592 100644
--- a/ci/jenkins/build-tests/Dockerfile
+++ b/ci/jenkins/build-tests/Dockerfile
@@ -1,7 +1,7 @@
FROM python:3.6.6-stretch
-COPY requirements.txt requirements.txt
-RUN pip install -r requirements.txt
+COPY requirements-dev.txt requirements-dev.txt
+RUN pip install -r requirements-dev.txt
COPY . .
RUN pip install -e .
diff --git a/ci/jenkins/performance-tests/Dockerfile b/ci/jenkins/performance-tests/Dockerfile
index 118ac57176e..b94ff9ed940 100644
--- a/ci/jenkins/performance-tests/Dockerfile
+++ b/ci/jenkins/performance-tests/Dockerfile
@@ -1,7 +1,7 @@
FROM python:3.6.6-stretch
-COPY requirements.txt requirements.txt
-RUN pip install -r requirements.txt
+COPY requirements-dev.txt requirements-dev.txt
+RUN pip install -r requirements-dev.txt
RUN pip install -q pytest==3.9.3 awscli pytest-benchmark feather-format lxml openpyxl xlrd numpy matplotlib sqlalchemy
COPY . .
diff --git a/ci/teamcity/Dockerfile.modin-base b/ci/teamcity/Dockerfile.modin-base
deleted file mode 100644
index de4cdd5912e..00000000000
--- a/ci/teamcity/Dockerfile.modin-base
+++ /dev/null
@@ -1,5 +0,0 @@
-ARG BASE_IMAGE=ray-project/deploy
-
-FROM ${BASE_IMAGE}
-RUN conda update python -y
-
diff --git a/ci/teamcity/Dockerfile.teamcity-ci b/ci/teamcity/Dockerfile.teamcity-ci
index eec0b4c8c05..1e8058e23e5 100644
--- a/ci/teamcity/Dockerfile.teamcity-ci
+++ b/ci/teamcity/Dockerfile.teamcity-ci
@@ -1,6 +1,14 @@
-FROM modin-project/modin-base
+# Create images from this container like this (in modin repo root):
+#
+# git rev-parse HEAD > ci/teamcity/git-rev
+#
+# tar cf ci/teamcity/modin.tar .
+#
+# docker build --build-arg ENVIRONMENT=environment-dev.yml -t modin-project/teamcity-ci:${BUILD_NUMBER} -f ci/teamcity/Dockerfile.teamcity-ci ci/teamcity
-ARG ENVIRONMENT=environment.yml
+FROM rayproject/ray:1.0.1
+
+ARG ENVIRONMENT=environment-dev.yml
ADD modin.tar /modin
ADD git-rev /modin/git-rev
@@ -10,13 +18,21 @@ WORKDIR /modin
# Make RUN commands use `bash --login`:
SHELL ["/bin/bash", "--login", "-c"]
-RUN conda env create -f ${ENVIRONMENT}
-
-# Initialize conda in bash config fiiles:
+# Initialize conda in bash config files:
RUN conda init bash
-ENV PATH /opt/conda/envs/modin/bin:$PATH
+ENV PATH /root/anaconda3/envs/modin/bin:$PATH
+
+RUN conda update python -y
+RUN conda env create -f ${ENVIRONMENT}
+RUN conda install curl PyGithub
# Activate the environment, and make sure it's activated:
+# The following line also removes conda initialization from
+# ~/.bashrc, so conda starts complaining that it should be
+# initialized for bash. This is necessary, however, because
+# activation is not always executed when "docker exec" is used,
+# and conda initialization would then overwrite PATH with its base
+# environment, where python doesn't have any packages installed.
RUN echo "conda activate modin" > ~/.bashrc
RUN echo "Make sure environment is activated"
-RUN conda list
+RUN conda list -n modin
diff --git a/ci/teamcity/build-docker.py b/ci/teamcity/build-docker.py
index 45525e41c15..cfcb0537822 100644
--- a/ci/teamcity/build-docker.py
+++ b/ci/teamcity/build-docker.py
@@ -15,7 +15,7 @@ def execute_command(cmd):
"(cd ../.. && git archive -o ci/teamcity/modin.tar $(cat ci/teamcity/git-rev))"
)
base_image = "ray-project/deploy"
- requirements = "requirements.txt"
+ requirements = "requirements-dev.txt"
execute_command(
"docker build -f Dockerfile.modin-base --build-arg BASE_IMAGE={} -t modin-project/modin-base .".format(
base_image
diff --git a/commitlint.config.js b/commitlint.config.js
index 6c95efaef68..d730fa765a0 100644
--- a/commitlint.config.js
+++ b/commitlint.config.js
@@ -2,7 +2,7 @@ module.exports = {
plugins: ['commitlint-plugin-jira-rules'],
extends: ['jira'],
rules: {
- "header-max-length": [2, "always", 70],
+ "header-max-length": [2, "always", 88],
"signed-off-by": [2, "always", "Signed-off-by"],
"jira-task-id-max-length": [0, "always", 10],
"jira-task-id-project-key": [2, "always", ["FEAT", "DOCS", "FIX", "REFACTOR", "TEST"]],
diff --git a/docs/UsingSQLonRay/index.rst b/docs/UsingSQLonRay/index.rst
index 82f3fb7b2a0..5248099ae15 100644
--- a/docs/UsingSQLonRay/index.rst
+++ b/docs/UsingSQLonRay/index.rst
@@ -10,7 +10,7 @@ Our plans with the SQL API for Modin are to create an interface that allows you
intermix SQL and pandas operations without copying the entire dataset into a new
structure between the two. This is possible due to the architecture of Modin. Currently,
Modin has a query compiler that acts as an intermediate layer between the query language
-(e.g. SQL, pandas) and the execution (See architecture_ documentation for details).
+(e.g. SQL, pandas) and the execution (See :doc:`architecture ` documentation for details).
*We have implemented a simple example that can be found below. Feedback welcome!*
@@ -29,5 +29,3 @@ Modin has a query compiler that acts as an intermediate layer between the query
col1 col2 column 3 col4
0 1 2.0 A String of information True
1 6 17.0 A String of different information False
-
-.. _architecture: https://modin.readthedocs.io/en/latest/architecture.html
diff --git a/docs/comparisons/dask.rst b/docs/comparisons/dask.rst
new file mode 100644
index 00000000000..82627916413
--- /dev/null
+++ b/docs/comparisons/dask.rst
@@ -0,0 +1,90 @@
+Modin vs. Dask Dataframe
+========================
+
+Dask's Dataframe is effectively a meta-frame, partitioning and scheduling many smaller
+``pandas.DataFrame`` objects. The Dask DataFrame does not implement the entire pandas
+API, and it isn't trying to. See this explained in the `Dask DataFrame documentation`_.
+
+**The TL;DR is that Modin's API is identical to pandas, whereas Dask's is not. Note: The
+projects are fundamentally different in their aims, so a fair comparison is
+challenging.**
+
+API
+---
+The API of Modin and Dask are different in several ways, explained here.
+
+Dask DataFrame
+""""""""""""""
+
+Dask is currently missing multiple APIs from pandas that Modin has implemented. Of note:
+Dask does not implement ``iloc``, ``MultiIndex``, ``apply(axis=0)``, ``quantile``,
+``median``, and more. Some of these APIs cannot be implemented efficiently or at all
+given the architecture design tradeoffs made in Dask's implementation, and others simply
+require engineering effort. ``iloc``, for example, can be implemented, but it would be
+inefficient, and ``apply(axis=0)`` cannot be implemented at all in Dask's architecture.
+
+Dask DataFrame's API is also different from the pandas API in that it is lazy and needs
+``.compute()`` calls to materialize the DataFrame. This makes the API less convenient
+but allows Dask to do certain query optimizations/rearrangements, which can give
+speedups in some situations. Several additional APIs exist in the Dask DataFrame API
+that expose internal state about how the data is chunked and other data layout details,
+and ways to manipulate that state.
+
+Semantically, Dask sorts the ``index``, which does not allow for user-specified order.
+In Dask's case, this was done for optimization purposes, to speed up other computations
+which involve the row index.
+
+Modin
+"""""
+
+Modin is targeted toward parallelizing the entire pandas API, without exception.
+As the pandas API continues to evolve, so will Modin's pandas API. Modin is intended to
+be used as a drop-in replacement for pandas, such that even if the API is not yet
+parallelized, it still works by falling back to running pandas. One of the key features
+of being a drop-in replacement is that not only will it work for existing code, but if a
+user wishes to go back to running pandas directly, they may do so at no cost. There's no
+lock-in: Modin notebooks can be converted to and from pandas as the user prefers.
+
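+As a minimal illustration of the drop-in nature (a sketch, not a benchmark), switching
+an existing pandas script over to Modin is typically just a change of the import line:
+
+.. code-block:: python
+
+    # import pandas as pd
+    import modin.pandas as pd
+
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    print(df.sum())  # the same pandas API, executed by Modin
+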
+In the long-term, Modin is planned to become a data science framework that supports all
+popular APIs (SQL, pandas, etc.) with the same underlying execution.
+
+Architecture
+------------
+
+The differences in Modin and Dask's architectures are explained in this section.
+
+Dask DataFrame
+""""""""""""""
+
+Dask DataFrame uses row-based partitioning, similar to Spark. This can be seen in their
+`documentation`_. They also have a custom index object for indexing into the object,
+which is not pandas compatible. Dask DataFrame seems to treat operations on the
+DataFrame as MapReduce operations, which is a good paradigm for the subset of the pandas
+API they have chosen to implement, but makes certain operations impossible. Dask
+Dataframe is also lazy and places a lot of partitioning responsibility on the user.
+
+Modin
+"""""
+
+Modin's partitioning is much more flexible, so the system can scale in both directions and
+have finer grained partitioning. This is explained at a high level in `Modin's
+documentation`_. Because we have this finer grained control over the partitioning, we
+can support a number of operations that are very challenging in MapReduce systems (e.g.
+transpose, median, quantile). This flexibility in partitioning also gives Modin
+tremendous power to implement efficient straggler mitigation and improvements in
+utilization over the entire cluster.
+
+Modin is also architected to run on a variety of systems. The goal here is that users
+can take the same notebook to different clusters or different environments and it will
+still just work, running on whatever you have! Modin does support running on Dask's
+compute engine in addition to Ray. Because the architecture of Modin is extremely
+modular, we are able to add different execution engines or compile to different memory
+formats. Modin can run on a Dask cluster in the same way that Dask DataFrame can, but
+they will still be different in all of the ways described above.
+
+Modin's implementation is grounded in theory, which is what enables us to implement the
+entire pandas API.
+
+.. _Dask DataFrame documentation: http://docs.dask.org/en/latest/dataframe.html#common-uses-and-anti-uses
+.. _documentation: http://docs.dask.org/en/latest/dataframe.html#design
+.. _Modin's documentation: https://modin.readthedocs.io/en/latest/developer/architecture.html
diff --git a/docs/comparisons/index.rst b/docs/comparisons/index.rst
new file mode 100644
index 00000000000..40647d065d9
--- /dev/null
+++ b/docs/comparisons/index.rst
@@ -0,0 +1,4 @@
+How is Modin unique?
+====================
+
+Coming Soon...
diff --git a/docs/comparisons/pandas.rst b/docs/comparisons/pandas.rst
new file mode 100644
index 00000000000..29e4749b798
--- /dev/null
+++ b/docs/comparisons/pandas.rst
@@ -0,0 +1,69 @@
+Modin vs. pandas
+================
+
+Modin exposes the pandas API through ``modin.pandas``, but it does not inherit the same
+pitfalls and design decisions that make it difficult to scale. This page will discuss
+how Modin's dataframe implementation differs from pandas, and how Modin scales pandas.
+
+Scalability of implementation
+-----------------------------
+
+The pandas implementation is inherently single-threaded. This means that only one of
+your CPU cores can be utilized at any given time. In a laptop, it would look something
+like this with pandas:
+
+.. image:: /img/pandas_multicore.png
+ :alt: pandas is single threaded!
+ :align: center
+ :scale: 80%
+
+However, Modin's implementation enables you to use all of the cores on your machine, or
+all of the cores in an entire cluster. On a laptop, it will look something like this:
+
+.. image:: /img/modin_multicore.png
+ :alt: modin uses all of the cores!
+ :align: center
+ :scale: 80%
+
+The additional utilization leads to improved performance; however, if you want to scale
+to an entire cluster, Modin suddenly looks something like this:
+
+.. image:: /img/modin_cluster.png
+ :alt: modin works on a cluster too!
+ :align: center
+ :scale: 30%
+
+Modin is able to efficiently make use of all of the hardware available to it!
+
+Memory usage and immutability
+-----------------------------
+
+The pandas API contains many cases of "inplace" updates, which are known to be
+controversial. This is due in part to the way pandas manages memory: the user may
+think they are saving memory, but pandas is usually copying the data whether an
+operation was inplace or not.
+
+Modin allows for inplace semantics, but the underlying data structures within Modin's
+implementation are immutable, unlike pandas. This immutability gives Modin the ability
+to internally chain operators and better manage memory layouts, because they will not
+be changed. This leads to improvements over pandas in memory usage in many common cases,
+due to the ability to share common memory blocks among all dataframes.
+
+Modin provides the inplace semantics by having a mutable pointer to the immutable
+internal Modin dataframe. This pointer can change, but the underlying data cannot, so
+when an inplace update is triggered, Modin will treat it as if it were not inplace and
+just update the pointer to the resulting Modin dataframe.
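+
+As a small illustrative sketch (nothing here beyond the behavior described above), the
+two forms below are equivalent from Modin's point of view:
+
+.. code-block:: python
+
+    import modin.pandas as pd
+
+    df = pd.DataFrame({"a": [1.0, None, 3.0]})
+
+    # "inplace" update: the name df is re-pointed to a new immutable internal frame
+    df.dropna(inplace=True)
+
+    # equivalent explicit rebinding
+    # df = df.dropna()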
+
+API vs implementation
+---------------------
+
+It is well known that the pandas API contains many duplicate ways of performing the same
+operation. Modin instead enforces that any one behavior have one and only one
+implementation internally. This requirement enables Modin to focus on and optimize a
+smaller code footprint while still guaranteeing that it covers the entire pandas API.
+Modin has an internal algebra of roughly 15 operators, narrowed down from the
+original >200 that exist in pandas. The algebra is grounded in both practical and
+theoretical work. Learn more in our `VLDB 2020 paper`_. More information about this
+algebra can be found in the :doc:`../developer/architecture` documentation.
+
+.. _VLDB 2020 paper: https://arxiv.org/abs/2001.00888
diff --git a/docs/comparisons/spark.rst b/docs/comparisons/spark.rst
new file mode 100644
index 00000000000..bf60963f710
--- /dev/null
+++ b/docs/comparisons/spark.rst
@@ -0,0 +1,4 @@
+Modin vs. Koalas and Spark
+==========================
+
+Coming Soon...
diff --git a/docs/conf.py b/docs/conf.py
index 5feac65284c..a55c81fa271 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -10,7 +10,7 @@
import modin
project = u"Modin"
-copyright = u"2018-2020, Modin"
+copyright = u"2018-2021, Modin"
author = u"Modin contributors"
# The short X.Y version
@@ -30,6 +30,7 @@
# ones.
extensions = [
"sphinx.ext.autodoc",
+ "sphinx.ext.napoleon",
"sphinx.ext.intersphinx",
'sphinx.ext.todo',
'sphinx.ext.mathjax',
diff --git a/docs/developer/contributing.rst b/docs/contributing.rst
similarity index 60%
rename from docs/developer/contributing.rst
rename to docs/contributing.rst
index 257e6ffb9d5..51a367e7da0 100644
--- a/docs/developer/contributing.rst
+++ b/docs/contributing.rst
@@ -8,8 +8,8 @@ If you're interested in getting involved in the development of Modin, but aren't
where start, take a look at the issues tagged `Good first issue`_ or Documentation_.
These are issues that would be good for getting familiar with the codebase and better
understanding some of the more complex components of the architecture. There is
-documentation here about the architecture_ that you will want to review in order to get
-started.
+documentation here about the :doc:`architecture ` that you will
+want to review in order to get started.
Also, feel free to join the discussions on the `developer mailing list`_.
@@ -49,7 +49,6 @@ with this project or the open source license(s) involved."
Signed-off-by: Awesome Developer
-.
Code without a proper signoff cannot be merged into the
master branch. Note: You must use your real name (sorry, no pseudonyms or anonymous
contributions.)
@@ -82,6 +81,30 @@ commits and push them to GitHub.
If you've pushed your changes to GitHub already you'll need to force push your branch
after this with ``git push -f``.
+Commit Message formatting
+-------------------------
+To ensure that all commit messages in the master branch follow a specific format, we
+enforce that all commit messages must follow the following format:
+
+.. code-block:: bash
+
+ FEAT-#9999: Add `DataFrame.rolling` functionality, to enable rolling window operations
+
+The ``FEAT`` component represents the type of commit. This component of the commit
+message can be one of the following:
+
+* FEAT: A new feature that is added
+* DOCS: Documentation improvements or updates
+* FIX: A bugfix contribution
+* REFACTOR: Moving or removing code without change in functionality
+* TEST: Test updates or improvements
+
+The ``#9999`` component of the commit message should be the issue number in the Modin
+GitHub issue tracker: https://github.com/modin-project/modin/issues. This is important
+because it links commits to their issues.
+
+The rest of the commit message should follow the colon (:) and be descriptive and succinct.
+
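+For example, a complete signed-off commit following this format might be created with
+the command below (the issue number is just a placeholder):
+
+.. code-block:: bash
+
+    git commit -s -m "DOCS-#1234: Clarify the commit message format"
+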
Development Dependencies
------------------------
@@ -91,9 +114,9 @@ dependencies for running the tests and formatting the code:
.. code-block:: bash
- pip install -r requirements.txt
-
-For developments under Windows, dependencies can be found in 'env_windows.yml' file.
+ conda env create --file environment-dev.yml
+ # or
+ pip install -r requirements-dev.txt
Code Formatting and Lint
------------------------
@@ -106,13 +129,13 @@ that you run the following from the project root:
black modin/
We also use flake8_ to check linting errors. Running the following from the project root
-will ensure that it passes the lint checks on Travis:
+will ensure that it passes the lint checks on Github Actions:
.. code-block:: bash
flake8 .
-We test that this has been run on our `Travis CI`_ test suite. If you do this and find
+We test that this has been run on our `Github Actions`_ test suite. If you do this and find
that the tests are still failing, try updating your version of black and flake8.
Adding a test
@@ -120,7 +143,7 @@ Adding a test
If you find yourself fixing a bug or adding a new feature, don't forget to add a test to
the test suite to verify its correctness! More on testing and the layout of the tests
-can be found in our testing_ documentation. We ask that you follow the existing
+can be found in our testing documentation. We ask that you follow the existing
structure of the tests for ease of maintenance.
Running the tests
@@ -142,11 +165,84 @@ subset of the test suite. In order to run a specific test run:
The entire test suite is automatically run for each pull request.
+Performance measurement
+-----------------------
+
+We use the Asv_ tool for performance tracking of various Modin functionality.
+
+Here are some scenarios in which Asv can be used:
+
+* You need to check the impact of a new patch on the performance of a certain set of operations:
+
+.. code-block:: bash
+
+ asv continuous -f 1.05 src/master HEAD -b TimeGroupBy --launch-method=spawn
+
+* You need to check that the benchmarks run without errors after changing them or writing new ones (an example benchmark sketch is given at the end of this section):
+
+.. code-block:: bash
+
+ asv run --quick --show-stderr --python=same --launch-method=spawn
+
+* You need to run the entire benchmark suite to get the current timing numbers:
+
+.. code-block:: bash
+
+ asv run --launch-method=spawn
+
+* You need to check a range of commits for performance degradation:
+
+.. code-block:: bash
+
+ asv run [start_hash]..[end_hash] --launch-method=spawn
+ asv publish
+ asv preview
+
+For more consistent results, you may need to use the following parameters (they are combined into a single command right after this list):
+
+* ``-a sample_time=1``
+* ``-a warmup_time=1``
+* ``-a processes=4``
+
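+For example, a full run of the suite with these settings might look like this (the
+parameters are passed via Asv's ``-a`` option, as listed above):
+
+.. code-block:: bash
+
+    asv run --launch-method=spawn -a sample_time=1 -a warmup_time=1 -a processes=4
+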
+Some details about using Modin on Ray with Asv:
+
+* ``--launch-method=forkserver`` does not work
+* Each set of parameters for each test is launched in its own process, which adds
+ a large overhead, since the redis server and other binaries required for Ray
+ initialization are started and destroyed for each process.
+
+Some details for maintenance:
+
+* ``modin/asv_bench/asv.conf.json`` contains the Modin dependencies used for benchmarking.
+
+We need to keep them up to date with the dependencies in ``setup.py``.
+
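+As a rough sketch (not an existing benchmark), a new Asv benchmark built on top of the
+helper utilities in ``asv_bench/benchmarks`` might look like the following; the import
+path and names here are assumptions and should be adjusted to the actual module layout:
+
+.. code-block:: python
+
+    from .utils import (
+        ASV_USE_IMPL,
+        RAND_HIGH,
+        RAND_LOW,
+        execute,
+        generate_dataframe,
+    )
+
+    class TimeSum:
+        param_names = ["shape"]
+        params = [[(10_000, 10), (250, 250)]]
+
+        def setup(self, shape):
+            # Build (and cache) a DataFrame for the requested implementation and shape.
+            self.df = generate_dataframe(
+                ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH
+            )
+
+        def time_sum(self, shape):
+            # Force materialization of the result so the work is actually measured.
+            execute(self.df.sum())
+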
+
+Building documentation
+----------------------
+
+To build the documentation, please follow the steps below from the project root:
+
+.. code-block:: bash
+
+ cd docs
+ pip install -r requirements-doc.txt
+ sphinx-build -b html . build
+
+To visualize the documentation locally, run the following from the `build` folder:
+
+.. code-block:: bash
+
+ python -m http.server
+ # python -m http.server 1234
+
+then open the browser at `0.0.0.0:<port>` (e.g. `0.0.0.0:1234`).
+
Contributing a new execution framework or in-memory format
----------------------------------------------------------
If you are interested in contributing support for a new execution framework or in-memory
-format, please make sure you understand the architecture_ of Modin.
+format, please make sure you understand the :doc:`architecture ` of Modin.
The best place to start the discussion for adding a new execution framework or in-memory
format is the `developer mailing list`_.
@@ -155,10 +251,8 @@ More docs on this coming soon...
.. _Good first issue: https://github.com/modin-project/modin/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue+%3Abeginner%3A%22
.. _Documentation: https://github.com/modin-project/modin/issues?q=is%3Aissue+is%3Aopen+label%3A%22documentation+%3Abookmark_tabs%3A%22
-.. _architecture: architecture.html
-.. _internal methods:
.. _black: https://github.com/ambv/black
.. _flake8: http://flake8.pycqa.org/en/latest/
-.. _Travis CI: https://travis-ci.org/
-.. _testing:
+.. _Github Actions: https://github.com/features/actions
+.. _Asv: https://github.com/airspeed-velocity/asv#airspeed-velocity
.. _developer mailing list: https://groups.google.com/forum/#!forum/modin-dev
diff --git a/docs/developer/architecture.rst b/docs/developer/architecture.rst
index 934e9781053..8305e460391 100644
--- a/docs/developer/architecture.rst
+++ b/docs/developer/architecture.rst
@@ -234,27 +234,23 @@ Supported Execution Frameworks and Memory Formats
This is the list of execution frameworks and memory formats supported in Modin. If you
would like to contribute a new execution framework or memory format, please see the
-documentation page on Contributing_.
+documentation page on :doc:`contributing `.
-- `Pandas on Ray`_
+- :doc:`Pandas on Ray `
- Uses the Ray_ execution framework.
- The compute kernel/in-memory format is a pandas DataFrame.
-- `Pandas on Dask`_
+- :doc:`Pandas on Dask `
- Uses the `Dask Futures`_ execution framework.
- The compute kernel/in-memory format is a pandas DataFrame.
-- `Pyarrow on Ray`_ (experimental)
+- :doc:`Pyarrow on Ray ` (experimental)
- Uses the Ray_ execution framework.
- The compute kernel/in-memory format is a pyarrow Table.
-.. _pandas Dataframe: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html
+.. _pandas Dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
.. _Arrow tables: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html
.. _Ray: https://github.com/ray-project/ray
.. _code: https://github.com/modin-project/modin/blob/master/modin/engines/base/frame/data.py
-.. _Contributing: contributing.html
-.. _Pandas on Ray: UsingPandasonRay/optimizations.html
-.. _Pandas on Dask: UsingPandasonDask/optimizations.html
.. _Dask Futures: https://docs.dask.org/en/latest/futures.html
.. _issue: https://github.com/modin-project/modin/issues
.. _Discourse: https://discuss.modin.org
.. _task parallel: https://en.wikipedia.org/wiki/Task_parallelism
-.. _Pyarrow on Ray: UsingPyarrowonRay/index.html
diff --git a/docs/examples/index.rst b/docs/examples/index.rst
new file mode 100644
index 00000000000..e7aba021fa3
--- /dev/null
+++ b/docs/examples/index.rst
@@ -0,0 +1,9 @@
+Examples
+========
+
+scikit-learn with LinearRegression
+----------------------------------
+Here is a Jupyter Notebook example that uses Modin together with scikit-learn's
+linear regression: `sklearn LinearRegression`_.
+
+.. _sklearn LinearRegression: https://github.com/modin-project/modin/blob/master/examples/modin-scikit-learn-example.ipynb
diff --git a/docs/img/modin_cluster.png b/docs/img/modin_cluster.png
new file mode 100644
index 00000000000..7bfb190b072
Binary files /dev/null and b/docs/img/modin_cluster.png differ
diff --git a/docs/img/modin_multicore.png b/docs/img/modin_multicore.png
new file mode 100644
index 00000000000..9dcd0bbfdf2
Binary files /dev/null and b/docs/img/modin_multicore.png differ
diff --git a/docs/img/pandas_multicore.png b/docs/img/pandas_multicore.png
new file mode 100644
index 00000000000..a56c4279848
Binary files /dev/null and b/docs/img/pandas_multicore.png differ
diff --git a/docs/index.rst b/docs/index.rst
index 2f8e80822a5..7afe3b1c78f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -127,6 +127,20 @@ nature, you get a fast DataFrame at 1MB and 1TB+.
using_modin
out_of_core
+ modin_xgboost
+
+.. toctree::
+ :caption: Examples
+
+ examples/index
+
+.. toctree::
+ :caption: How is Modin different from ...?
+
+ comparisons/index
+ comparisons/pandas
+ comparisons/dask
+ comparisons/spark
.. toctree::
:caption: Supported APIs
@@ -140,7 +154,7 @@ nature, you get a fast DataFrame at 1MB and 1TB+.
.. toctree::
:caption: Developer Documentation
- developer/contributing
+ contributing
developer/architecture
.. toctree::
@@ -151,18 +165,12 @@ nature, you get a fast DataFrame at 1MB and 1TB+.
UsingPyarrowonRay/index
UsingSQLonRay/index
-.. toctree::
- :caption: Contributing to Modin
-
- contributing
- architecture
-
.. toctree::
:caption: Help
troubleshooting
contact
-.. _Dataframe: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html
+.. _Dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
.. _Ray: https://github.com/ray-project/ray/
.. _Dask: https://dask.org/
diff --git a/docs/installation.rst b/docs/installation.rst
index f39d51c85ee..a89783fb930 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -84,7 +84,7 @@ You may already have a recent version of Dask_ installed, in which case you can
Building Modin from Source
--------------------------
-If you're planning on contributing_ to Modin, you will need to ensure that you are
+If you're planning on :doc:`contributing ` to Modin, you will need to ensure that you are
building Modin from the local repository that you are working off of. Occasionally,
there are issues in overlapping Modin installs from pypi and from source. To avoid these
issues, we recommend uninstalling Modin before you install from source:
@@ -109,8 +109,6 @@ Once cloned, ``cd`` into the ``modin`` directory and use ``pip`` to install:
.. _`GitHub repo`: https://github.com/modin-project/modin/tree/master
.. _issue: https://github.com/modin-project/modin/issues
-.. _`out of core`: out_of_core.html
.. _WSL: https://docs.microsoft.com/en-us/windows/wsl/install-win10
.. _Ray: http://ray.readthedocs.io
-.. _contributing: contributing.html
.. _Dask: https://github.com/dask/dask
diff --git a/docs/modin_vs_dask.md b/docs/modin_vs_dask.md
new file mode 100644
index 00000000000..477dba9a887
--- /dev/null
+++ b/docs/modin_vs_dask.md
@@ -0,0 +1,32 @@
+# What is the difference between Dask DataFrame and Modin?
+
+**The TL;DR is that Modin's API is identical to pandas, whereas Dask's is not. Note: The projects are fundamentally different in their aims, so a fair comparison is challenging.**
+
+## API
+
+### Dask DataFrame
+
+Dask DataFrame does not scale the entire pandas API, and it isn't trying to. See this explained in their documentation [here](http://docs.dask.org/en/latest/dataframe.html#common-uses-and-anti-uses)
+
+Dask DataFrame's API is also different from the pandas API in that it is lazy and needs .compute() to materialize the DataFrame. This makes the API less convenient but allows Dask to do certain query optimizations/rearrangements, which can give speedups in some situations. We are planning to incorporate similar capabilities into Modin but hope we can do so without having to change the API. We will outline plans for speeding up Modin in an upcoming blog post.
+
+### Modin
+
+Modin attempts to parallelize as much of the pandas API as is possible. We have worked through a significant portion of the DataFrame API. It is intended to be used as a drop-in replacement for pandas, such that even if the API is not yet parallelized, it still defaults to pandas.
+
+## Architecture
+
+### Dask DataFrame
+
+Dask DataFrame has row-based partitioning, similar to Spark. This can be seen in their [documentation](http://docs.dask.org/en/latest/dataframe.html#design). They also have a custom index object for indexing into the object, which is not pandas compatible. Dask DataFrame seems to treat operations on the DataFrame as MapReduce operations, which is a good paradigm for the subset of the pandas API they have chosen to implement.
+
+### Modin
+
+Modin is more of a column-store, which we inherited from modern database systems. We laterally partition the columns for scalability (many systems, such as Google BigTable, already did this), so we can scale in both directions and have finer grained partitioning. This is explained at a high level in [Modin's documentation](https://modin.readthedocs.io/en/latest/architecture.html). Because we have this finer grained control over the partitioning, we can support a number of operations that are very challenging in MapReduce systems (e.g. transpose, median, quantile).
+
+## Modin aims
+
+In the long-term, Modin is planned to become a DataFrame library that supports the popular APIs (SQL, pandas, etc.) and runs on a variety of compute engines and backends. In fact, a group was already able to contribute a dask.delayed backend to Modin in <200 lines of code ([PR](https://github.com/modin-project/modin/pull/281)).
+
+
+- Reference: [Query: What is the difference between Dask and Modin? #515](https://github.com/modin-project/modin/issues/515)
\ No newline at end of file
diff --git a/docs/modin_xgboost.rst b/docs/modin_xgboost.rst
new file mode 100644
index 00000000000..c52b3e0830a
--- /dev/null
+++ b/docs/modin_xgboost.rst
@@ -0,0 +1,146 @@
+Distributed XGBoost on Modin (experimental)
+===========================================
+
+Modin provides an implementation of the distributed XGBoost machine learning
+algorithm on Modin DataFrames. Please note that this feature is experimental, and its
+behavior or interfaces could change.
+
+Install XGBoost on Modin
+------------------------
+
+Modin comes with all the dependencies except the ``xgboost`` package by default.
+Currently, distributed XGBoost on Modin is only supported on the Ray backend; see
+the :doc:`installation page ` for more information on installing Modin with the Ray backend.
+To install the ``xgboost`` package you can use ``pip``:
+
+.. code-block:: bash
+
+ pip install xgboost
+
+
+XGBoost Train and Predict
+-------------------------
+
+Distributed XGBoost functionality is placed in ``modin.experimental.xgboost`` module.
+``modin.experimental.xgboost`` provides an xgboost-like API with ``train`` and ``predict`` functions.
+
+.. automodule:: modin.experimental.xgboost
+ :members: train
+
+``train`` accepts all the arguments of the ``xgboost.train`` function except for the
+``evals_result`` parameter, which is returned as part of the function's return value instead of being passed as an argument.
+
+.. automodule:: modin.experimental.xgboost
+ :noindex:
+ :members: predict
+
+``predict`` is similar to ``xgboost.Booster.predict`` with an additional argument,
+``model``.
+
+
+ModinDMatrix
+------------
+
+Data is passed to ``modin.experimental.xgboost`` functions via a ``ModinDMatrix`` object.
+
+.. automodule:: modin.experimental.xgboost
+ :noindex:
+ :members: ModinDMatrix
+
+Currently, ``ModinDMatrix`` supports only ``modin.pandas.DataFrame`` as input.
+
+
+A Single Node / Cluster setup
+-----------------------------
+
+The XGBoost part of Modin uses Ray resources in the same way as all other Modin functions.
+
+To start the Ray runtime on a single node:
+
+.. code-block:: python
+
+ import ray
+ ray.init()
+
+If you already have a Ray cluster running, you can connect to it as follows:
+
+.. code-block:: python
+
+ import ray
+ ray.init(address='auto')
+
+Detailed information about initializing the Ray runtime can be found on the `starting ray`_ page.
+
+
+Usage example
+-------------
+
+In the example below we train an XGBoost model using `the Iris Dataset`_ and get predictions on the same data.
+All processing will be done in `single node` mode.
+
+.. code-block:: python
+
+ from sklearn import datasets
+
+ import ray
+ ray.init() # Start the Ray runtime for single-node
+
+ import modin.pandas as pd
+ import modin.experimental.xgboost as xgb
+
+ # Load iris dataset from sklearn
+ iris = datasets.load_iris()
+
+ # Create Modin DataFrames
+ X = pd.DataFrame(iris.data)
+ y = pd.DataFrame(iris.target)
+
+ # Create ModinDMatrix
+ dtrain = xgb.ModinDMatrix(X, y)
+ dtest = xgb.ModinDMatrix(X, y)
+
+ # Set training parameters
+ xgb_params = {
+ "eta": 0.3,
+ "max_depth": 3,
+ "objective": "multi:softprob",
+ "num_class": 3,
+ "eval_metric": "mlogloss",
+ }
+ steps = 20
+
+ # Run training
+ model = xgb.train(
+ xgb_params,
+ dtrain,
+ steps,
+ evals=[(dtrain, "train")]
+ )
+
+ # Save the evaluation history and the booster for later use
+ evals_result = model["history"]
+ booster = model["booster"]
+
+ # Predict results
+ prediction = xgb.predict(model, dtest)
+
+
+Modes of a data distribution
+----------------------------
+
+Modin XGBoost provides two approaches to internal data distribution, which can be
+switched by the `evenly_data_distribution` parameter of the ``train/predict`` functions
+(a short example follows the list below):
+
+* ``evenly_data_distribution = True``: the input data of the ``train/predict``
+ functions will be distributed evenly between the nodes in a cluster to ensure even utilization of the nodes (default behavior).
+
+* ``evenly_data_distribution = False``: partitions of the input data of the ``train/predict``
+ functions will not be transferred between nodes in the cluster if the portion of empty nodes is <10%;
+ if the portion of empty nodes is ≥10%, even data distribution will be applied.
+ This mode minimizes data transfers between nodes but doesn't guarantee effective utilization of the nodes.
+ It is most effective when all cluster nodes are already occupied by data.
+
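+For illustration, building on the training example above, disabling even data
+distribution might look like this (only the parameter described above is added):
+
+.. code-block:: python
+
+    # Keep partitions where they already are when possible (see the caveats above).
+    model = xgb.train(
+        xgb_params,
+        dtrain,
+        steps,
+        evals=[(dtrain, "train")],
+        evenly_data_distribution=False,
+    )
+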
+
+.. _Dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
+.. _`starting ray`: https://docs.ray.io/en/master/starting-ray.html
+.. _`the Iris Dataset`: https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
diff --git a/docs/out_of_core.rst b/docs/out_of_core.rst
index 7e7fb862bf9..a7373388c7d 100644
--- a/docs/out_of_core.rst
+++ b/docs/out_of_core.rst
@@ -10,7 +10,7 @@ Install Modin out of core
-------------------------
Modin now comes with all the dependencies for out of core functionality by default! See
-the `installation page`_ for more information on installing Modin.
+the :doc:`installation page ` for more information on installing Modin.
Starting Modin with out of core enabled
---------------------------------------
@@ -62,5 +62,4 @@ This example creates a 40GB DataFrame from 20 identical 2GB DataFrames and perfo
various operations on them. Feel free to play around with this code and let us know what
you think!
-.. _Dataframe: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html
-.. _`installation page`: installation.html
+.. _Dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
diff --git a/docs/requirements-doc.txt b/docs/requirements-doc.txt
index 39b4d3724ba..10d2be6b342 100644
--- a/docs/requirements-doc.txt
+++ b/docs/requirements-doc.txt
@@ -3,14 +3,13 @@ click
flatbuffers
funcsigs
mock
-numpy
opencv-python
pyyaml
recommonmark
sphinx
sphinx-click
sphinx_rtd_theme
-pandas
-modin[all]
+git+https://github.com/modin-project/modin.git@master#egg=modin[all]
sphinxcontrib_plantuml
sphinx-issues
+xgboost
diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst
index 46d8bcb74cb..d2137bde5a3 100644
--- a/docs/supported_apis/dataframe_supported.rst
+++ b/docs/supported_apis/dataframe_supported.rst
@@ -79,7 +79,7 @@ default to pandas.
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``combine_first`` | `combine_first`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
-| ``compare`` | `compare`_ | D | |
+| ``compare`` | `compare`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``copy`` | `copy`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
diff --git a/docs/supported_apis/index.rst b/docs/supported_apis/index.rst
index 055119a148c..b70f533c007 100644
--- a/docs/supported_apis/index.rst
+++ b/docs/supported_apis/index.rst
@@ -5,7 +5,7 @@ For your convenience, we have compiled a list of currently implemented APIs and
available in Modin. This documentation is updated as new methods and APIs are merged
into the master branch, and not necessarily correct as of the most recent release. In
order to install the latest version of Modin, follow the directions found on the
-`installation page`_.
+:doc:`installation page `.
Questions on implementation details
-----------------------------------
@@ -33,20 +33,15 @@ Modin.
The exact methods we have implemented are listed in the respective subsections:
-* DataFrame_
-* Series_
-* utilities_
-* `I/O`_
+* :doc:`DataFrame `
+* :doc:`Series `
+* :doc:`utilities `
+* :doc:`I/O `
We have taken a community-driven approach to implementing new methods. We did a `study
on pandas usage`_ to learn what the most-used APIs are. Modin currently supports **93%**
of the pandas API based on our study of pandas usage, and we are actively expanding the
API.
-.. _DataFrame: dataframe_supported.html
-.. _Series: series_supported.html
-.. _utilities: utilities_supported.html
-.. _I/O: io_supported.html
-.. _study on pandas usage: https://github.com/modin-project/study_kaggle_usage
.. _`developer mailing list`: https://groups.google.com/forum/#!forum/modin-dev
-.. _`installation page`: installation.html#building-modin-from-source
+.. _`study on pandas usage`: https://github.com/modin-project/study_kaggle_usage
diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst
index a6b31d05a28..d45a51b7461 100644
--- a/docs/supported_apis/series_supported.rst
+++ b/docs/supported_apis/series_supported.rst
@@ -11,7 +11,7 @@ The second column is a flag for whether or not there is an implementation in Mod
the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` stands
for partial (meaning some parameters may not be supported yet), and ``D`` stands for
default to pandas. To learn more about the implementations that default to pandas, see
-the related section on `Defaulting to pandas`_.
+the related section on :doc:`Defaulting to pandas `.
+-----------------------------+---------------------------------+----------------------------------------------------+
| Series method | Modin Implementation? (Y/N/P/D) | Notes for Current implementation |
@@ -90,7 +90,7 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``combine_first`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
-| ``compare`` | D | |
+| ``compare`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``compress`` | D | |
+-----------------------------+---------------------------------+----------------------------------------------------+
@@ -474,10 +474,8 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``valid`` | D | |
+-----------------------------+---------------------------------+----------------------------------------------------+
-| ``value_counts`` | Y | The indices of resulting object will be in |
-| | | descending (ascending, if ascending=True) order for|
-| | | equal values. |
-| | | In pandas indices are located in random order. |
+| ``value_counts`` | Y | The indices order of resulting object may differ |
+| | | from pandas. |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``values`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
@@ -489,4 +487,3 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+----------------------------------------------------+
.. _`GitHub repository`: https://github.com/modin-project/modin/issues
-.. _`Defaulting to pandas`: index.html
diff --git a/docs/supported_apis/utilities_supported.rst b/docs/supported_apis/utilities_supported.rst
index 89e5e66a79a..f928c3600a8 100644
--- a/docs/supported_apis/utilities_supported.rst
+++ b/docs/supported_apis/utilities_supported.rst
@@ -21,10 +21,8 @@ default to pandas.
+---------------------------+---------------------------------+----------------------------------------------------+
| `pd.unique`_ | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
-| ``pd.value_counts`` | Y | The indices of resulting object will be in |
-| | | descending (ascending, if ascending=True) order for|
-| | | equal values. |
-| | | In pandas indices are located in random order. |
+| ``pd.value_counts`` | Y | The indices order of resulting object may differ |
+| | | from pandas. |
+---------------------------+---------------------------------+----------------------------------------------------+
| `pd.cut`_ | D | |
+---------------------------+---------------------------------+----------------------------------------------------+
diff --git a/docs/using_modin.rst b/docs/using_modin.rst
index 4e5a95f802e..8e91697e3c1 100644
--- a/docs/using_modin.rst
+++ b/docs/using_modin.rst
@@ -21,8 +21,10 @@ functional parity with pandas.
Using Modin on a Single Node
----------------------------
+**In local mode (without a cluster), Modin will create and manage a local (Dask or Ray) cluster for the execution.**
+
In order to use the most up-to-date version of Modin, please follow the instructions on
-the `installation page`_
+the :doc:`installation page `.
Once you import the library, you should see something similar to the following output:
@@ -50,7 +52,7 @@ Please note, the API is not yet complete. For some methods, you may see the foll
NotImplementedError: To contribute to Modin, please visit github.com/modin-project/modin.
-We have compiled a list of `currently supported methods`_.
+We have compiled a list of :doc:`currently supported methods `.
If you would like to request a particular method be implemented, feel free to `open an
issue`_. Before you open an issue please make sure that someone else has not already
@@ -88,7 +90,7 @@ you can customize your Ray environment for use in Modin!
Exceeding memory (Out of core pandas)
"""""""""""""""""""""""""""""""""""""
-Modin experimentally supports out of core operations. See more on the `Out of Core`_
+Modin experimentally supports out of core operations. See more on the :doc:`Out of Core `
documentation page.
Reducing or limiting the resources Modin can use
@@ -132,13 +134,10 @@ Examples
You can find an example on our recent `blog post`_ or on the `Jupyter Notebook`_ that we
used to create the blog post.
-.. _`DataFrame`: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html
+.. _`DataFrame`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
.. _`pandas`: https://pandas.pydata.org/pandas-docs/stable/
-.. _`installation page`: https://modin.readthedocs.io/en/latest/installation.html
-.. _`currently supported methods`: https://modin.readthedocs.io/en/latest/pandas_supported.html
.. _`open an issue`: https://github.com/modin-project/modin/issues
.. _`autoscaler documentation`: https://ray.readthedocs.io/en/latest/autoscaling.html
.. _`Ray's documentation`: https://ray.readthedocs.io/en/latest/api.html
.. _`blog post`: https://rise.cs.berkeley.edu/blog/pandas-on-ray-early-lessons/
.. _`Jupyter Notebook`: https://gist.github.com/devin-petersohn/f424d9fb5579a96507c709a36d487f24#file-pandas_on_ray_blog_post_0-ipynb
-.. _`Out of Core`: out_of_core.html
diff --git a/environment.yml b/environment-dev.yml
similarity index 79%
rename from environment.yml
rename to environment-dev.yml
index d50ca19a1f8..bd05c5975fa 100644
--- a/environment.yml
+++ b/environment-dev.yml
@@ -2,9 +2,9 @@ name: modin
channels:
- conda-forge
dependencies:
- - pandas==1.1.3
- - numpy
- - pyarrow<0.17
+ - pandas==1.2.1
+ - numpy>=1.16.5,<1.20 # pandas gh-39513
+ - pyarrow>=1.0.0
- dask[complete]>=2.12.0,<=2.19.0
- distributed>=2.12.0,<=2.19.0
- xarray
@@ -32,5 +32,7 @@ dependencies:
- rpyc==4.1.5
- cloudpickle==1.4.1
- boto3
+ - asv
+ - ray-core >=1.0.0
- pip:
- - ray>=1.0.0
+ - xgboost >=1.3
diff --git a/examples/docker/census-on-omnisci/build-docker-image.sh b/examples/docker/census-on-omnisci/build-docker-image.sh
new file mode 100644
index 00000000000..f4dcb266365
--- /dev/null
+++ b/examples/docker/census-on-omnisci/build-docker-image.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -e
+
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+echo "Note: a user is responsible for preparing the dataset.
+The dataset must be named as 'ipums_education2income_1970-2010.csv' and
+be in the folder with 'census-omnisci.dockerfile'. It can be downloaded by link:
+'https://rapidsai-data.s3.us-east-2.amazonaws.com/datasets/ipums_education2income_1970-2010.csv.gz'"
+
+cd "`dirname \"$0\"`"
+
+docker build -f census-omnisci.dockerfile -t census-omnisci --build-arg no_proxy \
+ --build-arg https_proxy --build-arg http_proxy --build-arg conda_extra_channel .
+printf "\n\nTo run the benchmark execute:\n\tdocker run --rm census-omnisci\n"
diff --git a/examples/docker/census-on-omnisci/census-omnisci.dockerfile b/examples/docker/census-on-omnisci/census-omnisci.dockerfile
new file mode 100644
index 00000000000..dd41e981053
--- /dev/null
+++ b/examples/docker/census-on-omnisci/census-omnisci.dockerfile
@@ -0,0 +1,63 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+FROM ubuntu:18.04
+ENV http_proxy ${http_proxy}
+ENV https_proxy ${https_proxy}
+ENV no_proxy ${no_proxy}
+ENV MODIN_BACKEND "omnisci"
+ENV MODIN_EXPERIMENTAL "true"
+
+ARG conda_extra_channel
+ENV add_extra_channel=${conda_extra_channel:+"-c ${conda_extra_channel}"}
+
+RUN apt-get update --yes && apt-get upgrade --yes \
+ && apt-get install wget --yes && \
+ rm -rf /var/lib/apt/lists/*
+
+ENV USER modin
+ENV UID 1000
+ENV HOME /home/$USER
+
+RUN adduser --disabled-password \
+ --gecos "Non-root user" \
+ --uid $UID \
+ --home $HOME \
+ $USER
+
+ENV CONDA_DIR ${HOME}/miniconda
+
+SHELL ["/bin/bash", "--login", "-c"]
+
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
+ bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
+ "${CONDA_DIR}/bin/conda" init bash && \
+ rm -f /tmp/miniconda3.sh && \
+ echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
+
+RUN conda update -n base -c defaults conda -y && \
+ conda create -n modin --yes --no-default-packages && \
+ conda activate modin && \
+ conda install -c intel/label/modin -c conda-forge modin "ray>=1.0.0" "numpy<1.20.0"
+
+RUN conda activate modin && \
+ conda install -c intel/label/modin -c conda-forge -c intel ${add_extra_channel} \
+ "daal4py>=2021.1" dpcpp_cpp_rt && \
+ conda install -c conda-forge "scikit-learn<0.24.0" && \
+ conda clean --all --yes
+
+COPY ipums_education2income_1970-2010.csv "${HOME}/ipums_education2income_1970-2010.csv"
+
+COPY census-omnisci.py "${HOME}/census-omnisci.py"
+
+CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/census-omnisci.py"]
diff --git a/examples/docker/census-on-omnisci/census-omnisci.py b/examples/docker/census-on-omnisci/census-omnisci.py
new file mode 100644
index 00000000000..48e946870b8
--- /dev/null
+++ b/examples/docker/census-on-omnisci/census-omnisci.py
@@ -0,0 +1,162 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import os
+import time
+import modin.pandas as pd
+from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer
+
+from sklearn import config_context
+import daal4py.sklearn as sklearn
+
+sklearn.patch_sklearn()
+from sklearn.model_selection import train_test_split
+import sklearn.linear_model as lm
+import numpy as np
+
+
+def read():
+ columns_names = [
+ "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "QGQ", "PERNUM", "PERWT", "SEX",
+ "AGE", "EDUC", "EDUCD", "INCTOT", "SEX_HEAD", "SEX_MOM", "SEX_POP", "SEX_SP", "SEX_MOM2", "SEX_POP2",
+ "AGE_HEAD", "AGE_MOM", "AGE_POP", "AGE_SP", "AGE_MOM2", "AGE_POP2", "EDUC_HEAD", "EDUC_MOM", "EDUC_POP",
+ "EDUC_SP", "EDUC_MOM2", "EDUC_POP2", "EDUCD_HEAD", "EDUCD_MOM", "EDUCD_POP", "EDUCD_SP", "EDUCD_MOM2",
+ "EDUCD_POP2", "INCTOT_HEAD", "INCTOT_MOM", "INCTOT_POP", "INCTOT_SP", "INCTOT_MOM2", "INCTOT_POP2",
+ ]
+ columns_types = [
+ "int64", "int64", "int64", "float64", "int64", "float64", "int64", "float64", "int64", "int64",
+ "int64", "int64", "int64", "int64", "int64", "float64", "float64", "float64", "float64", "float64",
+ "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64",
+ "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64",
+ "float64", "float64", "float64", "float64", "float64", "float64", "float64",
+ ]
+ dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))}
+
+ df = pd.read_csv(
+ os.path.expanduser('~/ipums_education2income_1970-2010.csv'),
+ names=columns_names,
+ dtype=dtypes,
+ skiprows=1,
+ )
+
+ df.shape # to trigger real execution
+ df._query_compiler._modin_frame._partitions[0][
+ 0
+ ].frame_id = OmnisciServer().put_arrow_to_omnisci(
+ df._query_compiler._modin_frame._partitions[0][0].get()
+ ) # to trigger real execution
+ return df
+
+
+def etl(df):
+ keep_cols = [
+ "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "PERNUM", "SEX", "AGE",
+ "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD", "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2",
+ "INCTOT_MOM", "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
+ ]
+ df = df[keep_cols]
+
+ df = df[df["INCTOT"] != 9999999]
+ df = df[df["EDUC"] != -1]
+ df = df[df["EDUCD"] != -1]
+
+ df["INCTOT"] = df["INCTOT"] * df["CPI99"]
+
+ for column in keep_cols:
+ df[column] = df[column].fillna(-1)
+
+ df[column] = df[column].astype("float64")
+
+ y = df["EDUC"]
+ X = df.drop(columns=["EDUC", "CPI99"])
+
+ # to trigger real execution
+ df.shape
+ y.shape
+ X.shape
+
+ return (df, X, y)
+
+
+def mse(y_test, y_pred):
+ return ((y_test - y_pred) ** 2).mean()
+
+
+def cod(y_test, y_pred):
+ y_bar = y_test.mean()
+ total = ((y_test - y_bar) ** 2).sum()
+ residuals = ((y_test - y_pred) ** 2).sum()
+ return 1 - (residuals / total)
+
+
+def ml(X, y, random_state, n_runs, test_size):
+ clf = lm.Ridge()
+
+ X = np.ascontiguousarray(X, dtype=np.float64)
+ y = np.ascontiguousarray(y, dtype=np.float64)
+
+ mse_values, cod_values = [], []
+ ml_scores = {}
+
+ print("ML runs: ", n_runs)
+ for i in range(n_runs):
+ (X_train, X_test, y_train, y_test) = train_test_split(
+ X, y, test_size=test_size, random_state=random_state
+ )
+ random_state += 777
+
+ with config_context(assume_finite=True):
+ model = clf.fit(X_train, y_train)
+
+ y_pred = model.predict(X_test)
+
+ mse_values.append(mse(y_test, y_pred))
+ cod_values.append(cod(y_test, y_pred))
+
+ ml_scores["mse_mean"] = sum(mse_values) / len(mse_values)
+ ml_scores["cod_mean"] = sum(cod_values) / len(cod_values)
+ ml_scores["mse_dev"] = pow(
+ sum([(mse_value - ml_scores["mse_mean"]) ** 2 for mse_value in mse_values])
+ / (len(mse_values) - 1),
+ 0.5,
+ )
+ ml_scores["cod_dev"] = pow(
+ sum([(cod_value - ml_scores["cod_mean"]) ** 2 for cod_value in cod_values])
+ / (len(cod_values) - 1),
+ 0.5,
+ )
+
+ return ml_scores
+
+
+def measure(name, func, *args, **kw):
+ t0 = time.time()
+ res = func(*args, **kw)
+ t1 = time.time()
+ print(f'{name}: {t1 - t0} sec')
+ return res
+
+
+def main():
+ # ML specific
+ N_RUNS = 50
+ TEST_SIZE = 0.1
+ RANDOM_STATE = 777
+
+ df = measure('Reading', read)
+ _, X, y = measure('ETL', etl, df)
+ measure('ML', ml, X, y, random_state=RANDOM_STATE, n_runs=N_RUNS, test_size=TEST_SIZE)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/docker/nyc-taxi.dockerfile b/examples/docker/nyc-taxi.dockerfile
index f10e749a1f7..2d6c816cfa6 100644
--- a/examples/docker/nyc-taxi.dockerfile
+++ b/examples/docker/nyc-taxi.dockerfile
@@ -12,11 +12,16 @@
# governing permissions and limitations under the License.
FROM ubuntu:18.04
+
+ARG PYTHON_VERSION=3.7
ENV http_proxy ${http_proxy}
ENV https_proxy ${https_proxy}
-RUN apt-get update --yes \
- && apt-get install wget --yes && \
+RUN apt-get update --yes && \
+ apt-get install --yes --no-install-recommends --fix-missing \
+ gcc \
+ python${PYTHON_VERSION}-dev \
+ wget && \
rm -rf /var/lib/apt/lists/*
ENV USER modin
@@ -33,7 +38,7 @@ ENV CONDA_DIR ${HOME}/miniconda
SHELL ["/bin/bash", "--login", "-c"]
-RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
+RUN wget --quiet --no-check-certificate https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
"${CONDA_DIR}/bin/conda" init bash && \
rm -f /tmp/miniconda3.sh && \
@@ -45,9 +50,9 @@ RUN conda update -n base -c defaults conda -y && \
pip install --no-cache-dir modin[ray] && \
conda clean --all --yes
-RUN wget https://modin-datasets.s3.amazonaws.com/trips_data.csv -O "${HOME}/trips_data.csv"
+RUN wget --quiet --no-check-certificate https://modin-datasets.s3.amazonaws.com/trips_data.csv -O "${HOME}/trips_data.csv"
COPY nyc-taxi.py "${HOME}/nyc-taxi.py"
-ENTRYPOINT ["/bin/bash", "--login", "-c", "conda run \"$@\"", "/bin/bash", "-n", "modin", "/usr/bin/env", "--"]
+ENTRYPOINT ["/bin/bash", "--login", "-c", "http_proxy= https_proxy= conda run \"$@\"", "/bin/bash", "-n", "modin", "/usr/bin/env", "--"]
CMD ["python", "${HOME}/nyc-taxi.py"]
diff --git a/examples/docker/nyc-taxi.py b/examples/docker/nyc-taxi.py
index 43cb53e1852..753b857b52b 100644
--- a/examples/docker/nyc-taxi.py
+++ b/examples/docker/nyc-taxi.py
@@ -40,19 +40,14 @@ def q2(df):
return df.groupby("passenger_count", as_index=False).mean()[["passenger_count", "total_amount"]]
def q3(df):
- transformed = pd.DataFrame({
- "passenger_count": df["passenger_count"],
- "pickup_datetime": df["pickup_datetime"].dt.year,
- })
- return transformed.groupby(["pickup_datetime", "passenger_count"]).agg({"passenger_count": ["count"]})
+ df["pickup_datetime"] = df["pickup_datetime"].dt.year
+ return df.groupby(["pickup_datetime", "passenger_count"]).size().reset_index()
+
def q4(df):
- transformed = pd.DataFrame({
- "passenger_count": df["passenger_count"],
- "pickup_datetime": df["pickup_datetime"].dt.year,
- "trip_distance": df["trip_distance"].astype("int64"),
- })
- return transformed.groupby(["passenger_count", "pickup_datetime", "trip_distance"]) \
+ df["pickup_datetime"] = df["pickup_datetime"].dt.year
+ df["trip_distance"] = df["trip_distance"].astype("int64")
+ return df.groupby(["passenger_count", "pickup_datetime", "trip_distance"]) \
.size().reset_index().sort_values(by=["pickup_datetime", 0], ascending=[True, False])
def measure(name, func, *args, **kw):
@@ -66,8 +61,8 @@ def main():
df = measure('Reading', read)
measure('Q1', q1, df)
measure('Q2', q2, df)
- measure('Q3', q3, df)
- measure('Q4', q4, df)
+ measure('Q3', q3, df.copy())
+ measure('Q4', q4, df.copy())
if __name__ == '__main__':
main()
diff --git a/examples/docker/plasticc-on-omnisci/build-docker-image.sh b/examples/docker/plasticc-on-omnisci/build-docker-image.sh
new file mode 100644
index 00000000000..dcc0f018bb1
--- /dev/null
+++ b/examples/docker/plasticc-on-omnisci/build-docker-image.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -e
+
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+echo "Note: a user is responsible for preparing the datasets.
+The datasets must be named as: 'training_set.csv', 'test_set_skiprows.csv',
+'training_set_metadata.csv', 'test_set_metadata.csv' and
+be in the folder with 'plasticc-omnisci.dockerfile'."
+
+cd "`dirname \"$0\"`"
+
+docker build -f plasticc-omnisci.dockerfile -t plasticc-omnisci --build-arg no_proxy \
+ --build-arg https_proxy --build-arg http_proxy --build-arg conda_extra_channel .
+printf "\n\nTo run the benchmark execute:\n\tdocker run --rm plasticc-omnisci\n"
diff --git a/examples/docker/plasticc-on-omnisci/plasticc-omnisci.dockerfile b/examples/docker/plasticc-on-omnisci/plasticc-omnisci.dockerfile
new file mode 100644
index 00000000000..8424a02d748
--- /dev/null
+++ b/examples/docker/plasticc-on-omnisci/plasticc-omnisci.dockerfile
@@ -0,0 +1,68 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+FROM ubuntu:18.04
+ENV http_proxy ${http_proxy}
+ENV https_proxy ${https_proxy}
+ENV no_proxy ${no_proxy}
+ENV MODIN_BACKEND "omnisci"
+ENV MODIN_EXPERIMENTAL "true"
+
+ARG conda_extra_channel
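+# Expands to "-c <channel>" when conda_extra_channel is provided, otherwise stays empty.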
+ENV add_extra_channel=${conda_extra_channel:+"-c ${conda_extra_channel}"}
+
+RUN apt-get update --yes \
+ && apt-get install wget --yes && \
+ rm -rf /var/lib/apt/lists/*
+
+ENV USER modin
+ENV UID 1000
+ENV HOME /home/$USER
+
+RUN adduser --disabled-password \
+ --gecos "Non-root user" \
+ --uid $UID \
+ --home $HOME \
+ $USER
+
+ENV CONDA_DIR ${HOME}/miniconda
+
+SHELL ["/bin/bash", "--login", "-c"]
+
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
+ bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
+ "${CONDA_DIR}/bin/conda" init bash && \
+ rm -f /tmp/miniconda3.sh && \
+ echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
+
+RUN conda update -n base -c defaults conda -y && \
+ conda create -n modin --yes --no-default-packages && \
+ conda activate modin && \
+ conda install -c intel/label/modin -c conda-forge "numpy<1.20.0" omniscidbe4py "ray-core>=1.0" \
+ "ray-autoscaler>=1.0" pandas==1.1.5 cloudpickle==1.4.1 rpyc==4.1.5 "dask>=2.12.0" && \
+ conda install -c intel/label/modin -c conda-forge modin==0.8.3
+
+RUN conda activate modin && \
+ conda install -c intel/label/modin -c conda-forge -c intel ${add_extra_channel} \
+ "daal4py>=2021.1" dpcpp_cpp_rt && \
+ conda install -c conda-forge scikit-learn==0.23.2 xgboost && \
+ conda clean --all --yes
+
+COPY training_set.csv "${HOME}/training_set.csv"
+COPY test_set_skiprows.csv "${HOME}/test_set_skiprows.csv"
+COPY training_set_metadata.csv "${HOME}/training_set_metadata.csv"
+COPY test_set_metadata.csv "${HOME}/test_set_metadata.csv"
+
+COPY plasticc-omnisci.py "${HOME}/plasticc-omnisci.py"
+
+CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/plasticc-omnisci.py"]
diff --git a/examples/docker/plasticc-on-omnisci/plasticc-omnisci.py b/examples/docker/plasticc-on-omnisci/plasticc-omnisci.py
new file mode 100644
index 00000000000..246e3c7aba8
--- /dev/null
+++ b/examples/docker/plasticc-on-omnisci/plasticc-omnisci.py
@@ -0,0 +1,237 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import os
+import time
+from collections import OrderedDict
+from functools import partial
+import modin.pandas as pd
+from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer
+
+import numpy as np
+import xgboost as xgb
+
+import daal4py.sklearn as sklearn
+
+sklearn.patch_sklearn()
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+
+
+################ helper functions ###############################
+def create_dtypes():
+ dtypes = OrderedDict(
+ [
+ ("object_id", "int32"),
+ ("mjd", "float32"),
+ ("passband", "int32"),
+ ("flux", "float32"),
+ ("flux_err", "float32"),
+ ("detected", "int32"),
+ ]
+ )
+
+ # load metadata
+ columns_names = [
+ "object_id", "ra", "decl", "gal_l", "gal_b", "ddf", "hostgal_specz",
+ "hostgal_photoz", "hostgal_photoz_err", "distmod", "mwebv", "target",
+ ]
+ meta_dtypes = ["int32"] + ["float32"] * 4 + ["int32"] + ["float32"] * 5 + ["int32"]
+ meta_dtypes = OrderedDict(
+ [(columns_names[i], meta_dtypes[i]) for i in range(len(meta_dtypes))]
+ )
+ return dtypes, meta_dtypes
+
+
+def trigger_read_op(dfs: tuple):
+ for df in dfs:
+ df.shape # to trigger real execution
+ return dfs
+
+
+def ravel_column_names(cols):
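+    # flatten a two-level column index, e.g. ("flux", "max") -> "flux_max"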
+ d0 = cols.get_level_values(0)
+ d1 = cols.get_level_values(1)
+ return ["%s_%s" % (i, j) for i, j in zip(d0, d1)]
+
+
+def measure(name, func, *args, **kw):
+ t0 = time.time()
+ res = func(*args, **kw)
+ t1 = time.time()
+ print(f'{name}: {t1 - t0} sec')
+ return res
+
+
+def all_etl(train, train_meta, test, test_meta):
+ train_final = etl(train, train_meta)
+ test_final = etl(test, test_meta)
+ return (train_final, test_final)
+
+
+def split_step(train_final, test_final):
+ X = train_final.drop(["object_id", "target"], axis=1).values
+ Xt = test_final.drop(["object_id"], axis=1).values
+
+ y = train_final["target"]
+ assert X.shape[1] == Xt.shape[1]
+ classes = sorted(y.unique())
+
+ class_weights = {c: 1 for c in classes}
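+    # classes 64 and 15 are given double weight in the evaluation metric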
+ class_weights.update({c: 2 for c in [64, 15]})
+
+ lbl = LabelEncoder()
+ y = lbl.fit_transform(y)
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ X, y, test_size=0.1, stratify=y, random_state=126
+ )
+
+ return X_train, y_train, X_test, y_test, Xt, classes, class_weights
+
+
+def multi_weighted_logloss(y_true, y_preds, classes, class_weights):
+    """
+    Multi-class weighted log loss for the PLAsTiCC challenge.
+
+    Refactored from code by @author olivier (https://www.kaggle.com/ogrellier).
+    """
+ y_p = y_preds.reshape(y_true.shape[0], len(classes), order="F")
+ y_ohe = pd.get_dummies(y_true)
+ y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
+ y_p_log = np.log(y_p)
+ y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
+ nb_pos = y_ohe.sum(axis=0).values.astype(float)
+ class_arr = np.array([class_weights[k] for k in sorted(class_weights.keys())])
+ y_w = y_log_ones * class_arr / nb_pos
+
+ loss = -np.sum(y_w) / np.sum(class_arr)
+ return loss
+
+
+def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weights):
+ loss = multi_weighted_logloss(y_true.get_label(), y_predicted, classes, class_weights)
+ return "wloss", loss
+
+################ helper functions ###############################
+
+
+def read(dtypes, meta_dtypes):
+ train = pd.read_csv(os.path.expanduser("~/training_set.csv"), dtype=dtypes)
+ test = pd.read_csv(
+ os.path.expanduser("~/test_set_skiprows.csv"),
+ names=list(dtypes.keys()),
+ dtype=dtypes,
+ header=0,
+ )
+
+ train_meta = pd.read_csv(os.path.expanduser("~/training_set_metadata.csv"), dtype=meta_dtypes)
+ target = meta_dtypes.pop("target")
+ test_meta = pd.read_csv(os.path.expanduser("~/test_set_metadata.csv"), dtype=meta_dtypes)
+ meta_dtypes["target"] = target
+
+ dfs = (train, train_meta, test, test_meta)
+ trigger_read_op(dfs)
+ return dfs
+
+
+def etl(df, df_meta):
+ # workaround for both Modin_on_ray and Modin_on_omnisci modes. Eventually this should be fixed
+ df["flux_ratio_sq"] = (df["flux"] / df["flux_err"]) * (
+ df["flux"] / df["flux_err"]
+ ) # np.power(df["flux"] / df["flux_err"], 2.0)
+ df["flux_by_flux_ratio_sq"] = df["flux"] * df["flux_ratio_sq"]
+
+ aggs = {
+ "passband": ["mean"],
+ "flux": ["min", "max", "mean", "skew"],
+ "flux_err": ["min", "max", "mean"],
+ "detected": ["mean"],
+ "mjd": ["max", "min"],
+ "flux_ratio_sq": ["sum"],
+ "flux_by_flux_ratio_sq": ["sum"],
+ }
+ agg_df = df.groupby("object_id", sort=False).agg(aggs)
+
+ agg_df.columns = ravel_column_names(agg_df.columns)
+
+ agg_df["flux_diff"] = agg_df["flux_max"] - agg_df["flux_min"]
+ agg_df["flux_dif2"] = agg_df["flux_diff"] / agg_df["flux_mean"]
+ agg_df["flux_w_mean"] = agg_df["flux_by_flux_ratio_sq_sum"] / agg_df["flux_ratio_sq_sum"]
+ agg_df["flux_dif3"] = agg_df["flux_diff"] / agg_df["flux_w_mean"]
+ agg_df["mjd_diff"] = agg_df["mjd_max"] - agg_df["mjd_min"]
+
+ agg_df = agg_df.drop(["mjd_max", "mjd_min"], axis=1)
+
+ agg_df = agg_df.reset_index()
+
+ df_meta = df_meta.drop(["ra", "decl", "gal_l", "gal_b"], axis=1)
+
+ df_meta = df_meta.merge(agg_df, on="object_id", how="left")
+
+ df_meta.shape # to trigger real execution
+ return df_meta
+
+
+def ml(train_final, test_final):
+ X_train, y_train, X_test, y_test, Xt, classes, class_weights = split_step(train_final, test_final)
+
+ cpu_params = {
+ "objective": "multi:softprob",
+ "eval_metric": "merror",
+ "tree_method": "hist",
+ "nthread": 16,
+ "num_class": 14,
+ "max_depth": 7,
+ "verbosity": 1,
+ "subsample": 0.7,
+ "colsample_bytree": 0.7,
+ }
+
+ func_loss = partial(xgb_multi_weighted_logloss, classes=classes, class_weights=class_weights)
+
+ dtrain = xgb.DMatrix(data=X_train, label=y_train)
+ dvalid = xgb.DMatrix(data=X_test, label=y_test)
+ dtest = xgb.DMatrix(data=Xt)
+
+ watchlist = [(dvalid, "eval"), (dtrain, "train")]
+
+ clf = xgb.train(
+ cpu_params,
+ dtrain=dtrain,
+ num_boost_round=60,
+ evals=watchlist,
+ feval=func_loss,
+ early_stopping_rounds=10,
+ verbose_eval=None,
+ )
+
+ yp = clf.predict(dvalid)
+ cpu_loss = multi_weighted_logloss(y_test, yp, classes, class_weights)
+ ysub = clf.predict(dtest) # noqa: F841 (unused variable)
+
+ return cpu_loss
+
+
+def main():
+ dtypes, meta_dtypes = create_dtypes()
+
+ train, train_meta, test, test_meta = measure('Reading', read, dtypes, meta_dtypes)
+ train_final, test_final = measure("ETL", all_etl, train, train_meta, test, test_meta)
+ cpu_loss = measure("ML", ml, train_final, test_final)
+
+ print("validation cpu_loss:", cpu_loss)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/docker/taxi-on-omnisci/build-docker-image.sh b/examples/docker/taxi-on-omnisci/build-docker-image.sh
new file mode 100644
index 00000000000..dcf2c395490
--- /dev/null
+++ b/examples/docker/taxi-on-omnisci/build-docker-image.sh
@@ -0,0 +1,24 @@
+#!/bin/bash -e
+
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+echo "Note: a user is responsible for preparing the dataset.
+The dataset must be named as 'trips_xaa.csv' and be in the folder with 'nyc-taxi-omnisci.dockerfile'.
+It Can be generated by following the instructions on the link:
+'https://github.com/toddwschneider/nyc-taxi-data#instructions'"
+
+cd "`dirname \"$0\"`"
+
+docker build -f nyc-taxi-omnisci.dockerfile -t nyc-taxi-omnisci --build-arg https_proxy --build-arg http_proxy .
+printf "\n\nTo run the benchmark execute:\n\tdocker run --rm nyc-taxi-omnisci\n"
diff --git a/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile
new file mode 100644
index 00000000000..be2349946c3
--- /dev/null
+++ b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile
@@ -0,0 +1,53 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+FROM ubuntu:18.04
+ENV http_proxy ${http_proxy}
+ENV https_proxy ${https_proxy}
+ENV MODIN_BACKEND "omnisci"
+ENV MODIN_EXPERIMENTAL "true"
+
+RUN apt-get update --yes \
+ && apt-get install wget --yes && \
+ rm -rf /var/lib/apt/lists/*
+
+ENV USER modin
+ENV UID 1000
+ENV HOME /home/$USER
+
+RUN adduser --disabled-password \
+ --gecos "Non-root user" \
+ --uid $UID \
+ --home $HOME \
+ $USER
+
+ENV CONDA_DIR ${HOME}/miniconda
+
+SHELL ["/bin/bash", "--login", "-c"]
+
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \
+ bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \
+ "${CONDA_DIR}/bin/conda" init bash && \
+ rm -f /tmp/miniconda3.sh && \
+ echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"
+
+RUN conda update -n base -c defaults conda -y && \
+ conda create -n modin --yes --no-default-packages && \
+ conda activate modin && \
+ conda install -c intel/label/modin -c conda-forge modin "ray>=1.0.0" "numpy<1.20.0" && \
+ conda clean --all --yes
+
+COPY trips_xaa.csv "${HOME}/trips_xaa.csv"
+COPY nyc-taxi-omnisci.py "${HOME}/nyc-taxi-omnisci.py"
+
+CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/nyc-taxi-omnisci.py"]
diff --git a/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py
new file mode 100644
index 00000000000..535e93727f9
--- /dev/null
+++ b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py
@@ -0,0 +1,108 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import os
+import time
+import modin.pandas as pd
+from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer
+
+def read():
+ columns_names = [
+ "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag",
+ "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",
+ "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount",
+ "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type",
+ "trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall",
+ "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid",
+ "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010",
+ "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma",
+ "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode", "dropoff_boroname",
+ "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil", "dropoff_ntacode",
+ "dropoff_ntaname", "dropoff_puma",
+ ]
+ # use string instead of category
+ columns_types = [
+ "int64", "string", "timestamp", "timestamp", "string", "int64", "float64", "float64",
+ "float64", "float64", "int64", "float64", "float64", "float64", "float64", "float64", "float64",
+ "float64", "float64", "float64", "string", "float64", "string", "string", "string", "float64",
+ "int64", "float64", "int64", "int64", "float64", "float64", "float64", "float64", "string", "float64",
+ "float64", "string", "string", "string", "float64", "float64", "float64", "float64", "string",
+ "float64", "float64", "string", "string", "string", "float64",
+ ]
+
+ dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))}
+ all_but_dates = {
+ col: valtype for (col, valtype) in dtypes.items() if valtype not in ["timestamp"]
+ }
+ dates_only = [col for (col, valtype) in dtypes.items() if valtype in ["timestamp"]]
+
+ df = pd.read_csv(
+ os.path.expanduser('~/trips_xaa.csv'),
+ names=columns_names,
+ dtype=all_but_dates,
+ parse_dates=dates_only,
+ )
+
+ df.shape # to trigger real execution
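+    # push the Arrow partition to the OmniSci server so the CSV import is fully materialized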
+ df._query_compiler._modin_frame._partitions[0][
+ 0
+ ].frame_id = OmnisciServer().put_arrow_to_omnisci(
+ df._query_compiler._modin_frame._partitions[0][0].get()
+ ) # to trigger real execution
+ return df
+
+
+def q1_omnisci(df):
+ q1_pandas_output = df.groupby("cab_type").size()
+ q1_pandas_output.shape # to trigger real execution
+ return q1_pandas_output
+
+def q2_omnisci(df):
+ q2_pandas_output = df.groupby("passenger_count").agg({"total_amount": "mean"})
+ q2_pandas_output.shape # to trigger real execution
+ return q2_pandas_output
+
+def q3_omnisci(df):
+ df["pickup_datetime"] = df["pickup_datetime"].dt.year
+ q3_pandas_output = df.groupby(["passenger_count", "pickup_datetime"]).size()
+ q3_pandas_output.shape # to trigger real execution
+ return q3_pandas_output
+
+def q4_omnisci(df):
+ df["pickup_datetime"] = df["pickup_datetime"].dt.year
+ df["trip_distance"] = df["trip_distance"].astype("int64")
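+    # .size() yields an unnamed count column (referenced as 0 in sort_values below)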
+ q4_pandas_output = (
+ df.groupby(["passenger_count", "pickup_datetime", "trip_distance"], sort=False)
+ .size()
+ .reset_index()
+ .sort_values(by=["pickup_datetime", 0], ignore_index=True, ascending=[True, False])
+ )
+ q4_pandas_output.shape # to trigger real execution
+ return q4_pandas_output
+
+def measure(name, func, *args, **kw):
+ t0 = time.time()
+ res = func(*args, **kw)
+ t1 = time.time()
+ print(f'{name}: {t1 - t0} sec')
+ return res
+
+def main():
+ df = measure('Reading', read)
+ measure('Q1', q1_omnisci, df)
+ measure('Q2', q2_omnisci, df)
+ measure('Q3', q3_omnisci, df.copy())
+ measure('Q4', q4_omnisci, df.copy())
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/tutorial/Dockerfile b/examples/tutorial/Dockerfile
new file mode 100644
index 00000000000..d2ead494987
--- /dev/null
+++ b/examples/tutorial/Dockerfile
@@ -0,0 +1,5 @@
+FROM continuumio/miniconda3
+
+RUN conda install -c conda-forge psutil setproctitle
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+
diff --git a/examples/tutorial/README.md b/examples/tutorial/README.md
new file mode 100644
index 00000000000..660ad8c0881
--- /dev/null
+++ b/examples/tutorial/README.md
@@ -0,0 +1,2 @@
+# modin-tutorial
+A tutorial on how to use the different features of Modin
diff --git a/examples/tutorial/requirements.txt b/examples/tutorial/requirements.txt
new file mode 100644
index 00000000000..7a4bc510d5d
--- /dev/null
+++ b/examples/tutorial/requirements.txt
@@ -0,0 +1,5 @@
+fsspec
+s3fs
+ray==1.0.0
+jupyterlab
+git+https://github.com/modin-project/modin
diff --git a/examples/tutorial/tutorial_notebooks/cluster/exercise_4.ipynb b/examples/tutorial/tutorial_notebooks/cluster/exercise_4.ipynb
new file mode 100644
index 00000000000..af43065f849
--- /dev/null
+++ b/examples/tutorial/tutorial_notebooks/cluster/exercise_4.ipynb
@@ -0,0 +1,146 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+    "Scale your pandas workflows by changing one line of code\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise 4: Setting up cluster environment\n",
+ "\n",
+ "**GOAL**: Learn how to set up a cluster for Modin.\n",
+ "\n",
+    "**NOTE**: This exercise has extra requirements. Read the instructions carefully before attempting.\n",
+ "\n",
+    "**This exercise instructs the user on how to start a 700+ core cluster, which is not shut down until the end of Exercise 5. Read the instructions carefully.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Often in practice we have a need to exceed the capabilities of a single machine. Modin works and performs well in both local mode and in a cluster environment. The key advantage of Modin is that your notebook does not change between local development and cluster execution. Users are not required to think about how many workers exist or how to distribute and partition their data; Modin handles all of this seamlessly and transparently.\n",
+ "\n",
+ "\n",
+ "\n",
+ "**Extra Requirements for this exercise**\n",
+ "\n",
+ "Detailed instructions can be found here: https://docs.ray.io/en/master/cluster/launcher.html\n",
+ "\n",
+ "From command line:\n",
+ "- `pip install boto3`\n",
+ "- `aws configure`\n",
+ "- `ray up modin-cluster.yaml`\n",
+ "\n",
+ "Included in this directory is a file named `modin-cluster.yaml`. We will use this to start the cluster."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip install boto3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !aws configure"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Starting and connecting to the cluster\n",
+ "\n",
+ "This example starts 1 head node (m5.24xlarge) and 7 workers (m5.24xlarge), 768 total CPUs.\n",
+ "\n",
+ "Cost of this cluster can be found here: https://aws.amazon.com/ec2/pricing/on-demand/."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !ray up modin-cluster.yaml"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Connect to the cluster with `ray attach`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !ray attach modin-cluster.yaml"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# DO NOT CHANGE THIS CODE!\n",
+ "# Changing this code risks breaking further exercises\n",
+ "\n",
+ "import time\n",
+ "time.sleep(600) # We need to give ray enough time to start up all the workers\n",
+ "import ray\n",
+ "ray.init(address=\"auto\")\n",
+ "from modin.config import NPartitions\n",
+ "assert NPartitions.get() == 768, \"Not all Ray nodes are started up yet\"\n",
+ "ray.shutdown()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Please move on to Exercise 5"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/tutorial/tutorial_notebooks/cluster/exercise_5.ipynb b/examples/tutorial/tutorial_notebooks/cluster/exercise_5.ipynb
new file mode 100644
index 00000000000..06a2de08675
--- /dev/null
+++ b/examples/tutorial/tutorial_notebooks/cluster/exercise_5.ipynb
@@ -0,0 +1,185 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+    "Scale your pandas workflows by changing one line of code\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise 5: Executing on a cluster environment\n",
+ "\n",
+ "**GOAL**: Learn how to connect Modin to a Ray cluster and run pandas queries on a cluster.\n",
+ "\n",
+    "**NOTE**: Exercise 4 must be completed first; this exercise relies on the cluster created in Exercise 4."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Modin performance scales as the number of nodes and cores increases. In this exercise, we will reproduce the data from the plot below.\n",
+ "\n",
+    "![Modin cluster performance](../img/modin_cluster_perf.png)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Don't change this cell!\n",
+ "import ray\n",
+ "ray.init(address=\"auto\")\n",
+ "import modin.pandas as pd\n",
+ "from modin.config import NPartitions\n",
+ "if NPartitions.get() != 768:\n",
+ " print(\"This notebook was designed and tested for an 8 node Ray cluster. \"\n",
+ " \"Proceed at your own risk!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!du -h big_yellow.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "df = pd.read_csv(\"big_yellow.csv\", quoting=3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "count_result = df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print\n",
+ "count_result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "groupby_result = df.groupby(\"passenger_count\").count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print\n",
+ "groupby_result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "apply_result = df.applymap(str)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print\n",
+ "apply_result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ray.shutdown()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Shutting down the cluster\n",
+ "\n",
+    "**You may have to change the path below**. If this does not work, log in to the AWS console and terminate the cluster instances manually.\n",
+ "\n",
+ "Now that we have finished computation, we can shut down the cluster with `ray down`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ray down modin-cluster.yaml"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### This ends the cluster exercise"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/tutorial/tutorial_notebooks/cluster/modin-cluster.yaml b/examples/tutorial/tutorial_notebooks/cluster/modin-cluster.yaml
new file mode 100644
index 00000000000..78b3be39daa
--- /dev/null
+++ b/examples/tutorial/tutorial_notebooks/cluster/modin-cluster.yaml
@@ -0,0 +1,163 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: modin_init
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 7
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 7
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 7
+
+# Whether or not to autoscale aggressively. If this is enabled, if at any point
+# we would start more workers, we start at least enough to bring us to
+# initial_workers.
+autoscaling_mode: default
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+ image: "" # e.g., rayproject/ray:0.8.7
+ container_name: "" # e.g. ray_docker
+ # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
+ # if no cached version is present.
+ pull_before_run: True
+ run_options: [] # Extra options to pass into "docker run"
+
+ # Example of running a GPU head with CPU workers
+ # head_image: "rayproject/ray:0.8.7-gpu"
+ # head_run_options:
+ # - --runtime=nvidia
+
+ # worker_image: "rayproject/ray:0.8.7"
+ # worker_run_options: []
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# The max value allowed is 1.0, which is the most conservative setting.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+ type: aws
+ region: us-west-2
+ # Availability zone(s), comma-separated, that nodes may be launched in.
+ # Nodes are currently spread between zones by a round-robin approach,
+ # however this implementation detail should not be relied upon.
+ availability_zone: us-west-2a,us-west-2b
+ # Whether to allow node reuse. If set to False, nodes will be terminated
+ # instead of stopped.
+ cache_stopped_nodes: True # If not present, the default is True.
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+ ssh_user: ubuntu
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below.
+# ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+head_node:
+ InstanceType: m5.24xlarge
+ ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+
+ # You can provision additional disk space with a conf as follows
+ BlockDeviceMappings:
+ - DeviceName: /dev/sda1
+ Ebs:
+ VolumeSize: 500
+
+ # Additional options in the boto docs.
+
+# Provider-specific config for worker nodes, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+worker_nodes:
+ InstanceType: m5.24xlarge
+ ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
+
+ BlockDeviceMappings:
+ - DeviceName: /dev/sda1
+ Ebs:
+ VolumeSize: 500
+ # Run workers on spot by default. Comment this out to use on-demand.
+ # InstanceMarketOptions:
+ # MarketType: spot
+ # Additional options can be found in the boto docs, e.g.
+ # SpotOptions:
+ # MaxPrice: MAX_HOURLY_PRICE
+
+ # Additional options in the boto docs.
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+# "/path1/on/remote/machine": "/path1/on/local/machine",
+# "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# Files or directories to copy from the head node to the worker nodes. The format is a
+# list of paths. The same path on the head node will be copied to the worker node.
+# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
+# you should just use file_mounts. Only use this if you know what you're doing!
+cluster_synced_files: []
+
+# Whether changes to directories in file_mounts or cluster_synced_files in the head node
+# should sync to the worker node continuously
+file_mounts_sync_continuously: False
+
+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is setup.
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+setup_commands:
+ # Note: if you're developing Ray, you probably want to create an AMI that
+ # has your Ray repo pre-cloned. Then, you can replace the pip installs
+ # below with a git checkout (and possibly a recompile).
+ - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
+ - pip install modin
+ - pip install ray==1.0.0
+ - pip install pyarrow==0.16
+ - pip install -U fsspec
+ - wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-01.csv
+ - printf "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount\n" > big_yellow.csv
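+  # brace expansion repeats the file name 64 times (2^6), so tail appends the data many times to build a bigger CSV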
+ - tail -n +2 yellow_tripdata_2015-01.csv{,}{,}{,}{,}{,}{,} >> big_yellow.csv
+ # Consider uncommenting these if you also want to run apt-get commands during setup
+ # - sudo pkill -9 apt-get || true
+ # - sudo pkill -9 dpkg || true
+ # - sudo dpkg --configure -a
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+ - pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+ - ray stop
+ - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+ - ray stop
+ - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
diff --git a/examples/tutorial/tutorial_notebooks/img/MODIN_ver2_hrz.png b/examples/tutorial/tutorial_notebooks/img/MODIN_ver2_hrz.png
new file mode 100644
index 00000000000..6276bd6168c
Binary files /dev/null and b/examples/tutorial/tutorial_notebooks/img/MODIN_ver2_hrz.png differ
diff --git a/examples/tutorial/tutorial_notebooks/img/convert_to_pandas.png b/examples/tutorial/tutorial_notebooks/img/convert_to_pandas.png
new file mode 100644
index 00000000000..1ba62de95ce
Binary files /dev/null and b/examples/tutorial/tutorial_notebooks/img/convert_to_pandas.png differ
diff --git a/examples/tutorial/tutorial_notebooks/img/modin_cluster.png b/examples/tutorial/tutorial_notebooks/img/modin_cluster.png
new file mode 100644
index 00000000000..7bfb190b072
Binary files /dev/null and b/examples/tutorial/tutorial_notebooks/img/modin_cluster.png differ
diff --git a/examples/tutorial/tutorial_notebooks/img/modin_cluster_perf.png b/examples/tutorial/tutorial_notebooks/img/modin_cluster_perf.png
new file mode 100644
index 00000000000..d35e2411c19
Binary files /dev/null and b/examples/tutorial/tutorial_notebooks/img/modin_cluster_perf.png differ
diff --git a/examples/tutorial/tutorial_notebooks/img/modin_multicore.png b/examples/tutorial/tutorial_notebooks/img/modin_multicore.png
new file mode 100644
index 00000000000..9dcd0bbfdf2
Binary files /dev/null and b/examples/tutorial/tutorial_notebooks/img/modin_multicore.png differ
diff --git a/examples/tutorial/tutorial_notebooks/img/pandas_multicore.png b/examples/tutorial/tutorial_notebooks/img/pandas_multicore.png
new file mode 100644
index 00000000000..a56c4279848
Binary files /dev/null and b/examples/tutorial/tutorial_notebooks/img/pandas_multicore.png differ
diff --git a/examples/tutorial/tutorial_notebooks/img/read_csv_perf.png b/examples/tutorial/tutorial_notebooks/img/read_csv_perf.png
new file mode 100644
index 00000000000..7e5f7e8ff63
Binary files /dev/null and b/examples/tutorial/tutorial_notebooks/img/read_csv_perf.png differ
diff --git a/examples/tutorial/tutorial_notebooks/introduction/exercise_1.ipynb b/examples/tutorial/tutorial_notebooks/introduction/exercise_1.ipynb
new file mode 100644
index 00000000000..a2b1de7034b
--- /dev/null
+++ b/examples/tutorial/tutorial_notebooks/introduction/exercise_1.ipynb
@@ -0,0 +1,232 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+    "Scale your pandas workflows by changing one line of code\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise 1: How to use Modin\n",
+ "\n",
+ "**GOAL**: Learn how to import Modin to accelerate and scale pandas workflows."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Modin is a drop-in replacement for pandas that distributes the computation \n",
+ "across all of the cores in your machine or in a cluster.\n",
+ "In practical terms, this means that you can continue using the same pandas scripts\n",
+ "as before and expect the behavior and results to be the same. The only thing that needs\n",
+ "to change is the import statement. Normally, you would change:\n",
+ "\n",
+ "```python\n",
+ "import pandas as pd\n",
+ "```\n",
+ "\n",
+ "to:\n",
+ "\n",
+ "```python\n",
+ "import modin.pandas as pd\n",
+ "```\n",
+ "\n",
+ "Changing this line of code will allow you to use all of the cores in your machine to do computation on your data. One of the major performance bottlenecks of pandas is that it only uses a single core for any given computation. Modin exposes an API that is identical to Pandas, allowing you to continue interacting with your data as you would with Pandas. There are no additional commands required to use Modin locally. Partitioning, scheduling, data transfer, and other related concerns are all handled by Modin under the hood."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+    "| pandas on a multicore laptop | Modin on a multicore laptop |\n",
+    "| --- | --- |\n",
+    "| ![pandas on a multicore laptop](../img/pandas_multicore.png) | ![Modin on a multicore laptop](../img/modin_multicore.png) |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Concept for exercise: Dataframe constructor\n",
+ "\n",
+ "Often when playing around in Pandas, it is useful to create a DataFrame with the constructor. That is where we will start.\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n",
+ "df = pd.DataFrame(frame_data)\n",
+ "```\n",
+ "\n",
+ "When creating a dataframe from a non-distributed object, it will take extra time to partition the data. When this is happening, you will see this message:\n",
+ "\n",
+ "```\n",
+ "UserWarning: Distributing object. This may take some time.\n",
+ "```\n",
+ "\n",
+ "**In a later exercise, we will discuss times when it is not possible to speed up the computation, even with multiprocessing or multithreading.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Note: Do not change this code!\n",
+ "import numpy as np\n",
+ "import pandas\n",
+ "import subprocess\n",
+ "import sys\n",
+ "import modin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pandas.__version__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modin.__version__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Implement your answer here. You are also free to play with the size\n",
+ "# and shape of the DataFrame, but beware of exceeding your memory!\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "frame_data = np.random.randint(0, 100, size=(2**10, 2**5))\n",
+ "df = pd.DataFrame(frame_data)\n",
+ "\n",
+ "# ***** Do not change the code below! It verifies that \n",
+ "# ***** the exercise has been done correctly. *****\n",
+ "\n",
+ "try:\n",
+ " assert df is not None\n",
+ " assert frame_data is not None\n",
+ " assert isinstance(frame_data, np.ndarray)\n",
+ "except:\n",
+ " raise AssertionError(\"Don't change too much of the original code!\")\n",
+ "assert \"modin.pandas\" in sys.modules, \"Not quite correct. Remember the single line of code change (See above)\"\n",
+ "\n",
+ "import modin.pandas\n",
+ "assert pd == modin.pandas, \"Remember the single line of code change (See above)\"\n",
+ "assert hasattr(df, \"_query_compiler\"), \"Make sure that `df` is a modin.pandas DataFrame.\"\n",
+ "\n",
+ "print(\"Success! You only need to change one line of code!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have created a toy example for playing around with the DataFrame, let's print it out in different ways."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Concept for Exercise: Data Interaction and Printing\n",
+ "\n",
+    "When interacting with data, it is very important to look at different parts of the data (e.g. `df.head()`). Here we will show that you can print a modin.pandas DataFrame in the same ways you would a pandas DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Print the first 10 lines.\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Print the DataFrame.\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Free cell for custom interaction (Play around here!)\n",
+ "df.add_prefix(\"col\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.count()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Please move on to Exercise 2 when you are ready**"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/tutorial/tutorial_notebooks/introduction/exercise_2.ipynb b/examples/tutorial/tutorial_notebooks/introduction/exercise_2.ipynb
new file mode 100644
index 00000000000..65789ed3880
--- /dev/null
+++ b/examples/tutorial/tutorial_notebooks/introduction/exercise_2.ipynb
@@ -0,0 +1,525 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "Scale your pandas workflows by changing one line of code
\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise 2: Speed improvements\n",
+ "\n",
+ "**GOAL**: Learn about common functionality that Modin speeds up by using all of your machine's cores."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Concept for Exercise: `read_csv` speedups\n",
+ "\n",
+    "The most commonly used data ingestion method in pandas is reading CSV files (link to pandas survey). This concept is designed to give an idea of the kinds of speedups possible, even on a non-distributed filesystem. Modin also supports other file formats for parallel and distributed reads; these are described in the documentation.\n",
+ "\n",
+    "![read_csv performance](../img/read_csv_perf.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will import both Modin and pandas so that the speedups are evident.\n",
+ "\n",
+ "**Note: Rerunning the `read_csv` cells many times may result in degraded performance, depending on the memory of the machine**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import modin.pandas as pd\n",
+ "import pandas as old_pd\n",
+ "import time\n",
+ "from IPython.display import Markdown, display\n",
+ "\n",
+ "def printmd(string):\n",
+ " display(Markdown(string))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Dataset: 2015 NYC taxi trip data\n",
+ "\n",
+ "Link to raw dataset: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n",
+ "\n",
+ "We will be using a version of this data already in S3, originally posted in this blog post: https://matthewrocklin.com/blog/work/2017/01/12/dask-dataframes\n",
+ "\n",
+ "**Size: ~2GB**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_path = \"s3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## `pandas.read_csv`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "pandas_df = old_pd.read_csv(s3_path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n",
+ "\n",
+ "end = time.time()\n",
+ "pandas_duration = end - start\n",
+ "print(\"Time to read with pandas: {} seconds\".format(round(pandas_duration, 3)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Expect pandas to take >3 minutes on EC2, longer locally\n",
+ "\n",
+    "This is a good time to chat with your neighbor.\n",
+    "Discussion topics:\n",
+ "- Do you work with a large amount of data daily?\n",
+ "- How big is your data?\n",
+ "- What’s the common use case of your data?\n",
+ "- Do you use any big data analytics tools?\n",
+ "- Do you use any interactive analytics tool?\n",
+    "- What are some drawbacks of your current interactive analytics tools?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## `modin.pandas.read_csv`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "modin_df = pd.read_csv(s3_path, parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"], quoting=3)\n",
+ "\n",
+ "end = time.time()\n",
+ "modin_duration = end - start\n",
+ "print(\"Time to read with Modin: {} seconds\".format(round(modin_duration, 3)))\n",
+ "\n",
+ "printmd(\"### Modin is {}x faster than pandas at `read_csv`!\".format(round(pandas_duration / modin_duration, 2)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Are they equal?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pandas_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modin_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Concept for exercise: Reductions\n",
+ "\n",
+ "In pandas, a reduction would be something along the lines of a `sum` or `count`. It computes some summary statistics about the rows or columns. We will be using `count`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "pandas_count = pandas_df.count()\n",
+ "\n",
+ "end = time.time()\n",
+ "pandas_duration = end - start\n",
+ "\n",
+ "print(\"Time to count with pandas: {} seconds\".format(round(pandas_duration, 3)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "modin_count = modin_df.count()\n",
+ "\n",
+ "end = time.time()\n",
+ "modin_duration = end - start\n",
+ "print(\"Time to count with Modin: {} seconds\".format(round(modin_duration, 3)))\n",
+ "\n",
+ "printmd(\"### Modin is {}x faster than pandas at `count`!\".format(round(pandas_duration / modin_duration, 2)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Are they equal?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pandas_count"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modin_count"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Concept for exercise: Map operations\n",
+ "\n",
+    "In pandas, map operations do a single pass over the data and do not change its shape. Operations like `isnull` and `applymap` fall into this category. We will be using `isnull`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "pandas_isnull = pandas_df.isnull()\n",
+ "\n",
+ "end = time.time()\n",
+ "pandas_duration = end - start\n",
+ "\n",
+ "print(\"Time to isnull with pandas: {} seconds\".format(round(pandas_duration, 3)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "modin_isnull = modin_df.isnull()\n",
+ "\n",
+ "end = time.time()\n",
+ "modin_duration = end - start\n",
+ "print(\"Time to isnull with Modin: {} seconds\".format(round(modin_duration, 3)))\n",
+ "\n",
+ "printmd(\"### Modin is {}x faster than pandas at `isnull`!\".format(round(pandas_duration / modin_duration, 2)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Are they equal?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pandas_isnull"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modin_isnull"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Concept for exercise: Apply over a single column\n",
+ "\n",
+ "Sometimes we want to compute some summary statistics on a single column from our dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "rounded_trip_distance_pandas = pandas_df[\"trip_distance\"].apply(round)\n",
+ "\n",
+ "end = time.time()\n",
+ "pandas_duration = end - start\n",
+ "print(\"Time to apply on one column with pandas: {} seconds\".format(round(pandas_duration, 3)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "rounded_trip_distance_modin = modin_df[\"trip_distance\"].apply(round)\n",
+ "\n",
+ "end = time.time()\n",
+ "modin_duration = end - start\n",
+ "print(\"Time to apply on one column with Modin: {} seconds\".format(round(modin_duration, 3)))\n",
+ "\n",
+ "printmd(\"### Modin is {}x faster than pandas at `apply` on one column!\".format(round(pandas_duration / modin_duration, 2)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Are they equal?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rounded_trip_distance_pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rounded_trip_distance_modin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Concept for exercise: Add a column\n",
+ "\n",
+ "It is common to need to add a new column to an existing dataframe; here we show that this is significantly faster in Modin due to its metadata management and an efficient zero-copy implementation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "pandas_df[\"rounded_trip_distance\"] = rounded_trip_distance_pandas\n",
+ "\n",
+ "end = time.time()\n",
+ "pandas_duration = end - start\n",
+ "print(\"Time to add a column with pandas: {} seconds\".format(round(pandas_duration, 3)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "modin_df[\"rounded_trip_distance\"] = rounded_trip_distance_modin\n",
+ "\n",
+ "end = time.time()\n",
+ "modin_duration = end - start\n",
+ "print(\"Time to add a column with Modin: {} seconds\".format(round(modin_duration, 3)))\n",
+ "\n",
+ "printmd(\"### Modin is {}x faster than pandas at adding a column!\".format(round(pandas_duration / modin_duration, 2)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Are they equal?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pandas_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modin_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Concept for exercise: Groupby and aggregate\n",
+ "\n",
+ "In pandas, you can group by one or more columns and then aggregate. We will group by a column in the dataset and use `count` as our aggregation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "pandas_groupby = pandas_df.groupby(by=\"rounded_trip_distance\").count()\n",
+ "\n",
+ "end = time.time()\n",
+ "pandas_duration = end - start\n",
+ "\n",
+ "print(\"Time to groupby with pandas: {} seconds\".format(round(pandas_duration, 3)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "modin_groupby = modin_df.groupby(by=\"rounded_trip_distance\").count()\n",
+ "\n",
+ "end = time.time()\n",
+ "modin_duration = end - start\n",
+ "print(\"Time to groupby with Modin: {} seconds\".format(round(modin_duration, 3)))\n",
+ "\n",
+ "printmd(\"### Modin is {}x faster than pandas at `groupby`!\".format(round(pandas_duration / modin_duration, 2)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Are they equal?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pandas_groupby"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modin_groupby"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Please move on to tutorial 3"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/tutorial/tutorial_notebooks/introduction/exercise_3.ipynb b/examples/tutorial/tutorial_notebooks/introduction/exercise_3.ipynb
new file mode 100644
index 00000000000..6de80d3175a
--- /dev/null
+++ b/examples/tutorial/tutorial_notebooks/introduction/exercise_3.ipynb
@@ -0,0 +1,303 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "Scale your pandas workflows by changing one line of code\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise 3: Not Implemented\n",
+ "\n",
+ "**GOAL**: Learn what happens when a function is not yet supported in Modin and what functionality is not possible to accelerate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "When functionality has not yet been implemented, we default to pandas.\n",
+ "\n",
+ "\n",
+ "\n",
+ "We convert a Modin dataframe to pandas to do the operation, then convert it back once it is finished. These operations will have a high overhead due to the communication involved and will take longer than pandas.\n",
+ "\n",
+ "When this happens, a warning is issued to inform the user that the operation will take longer than usual. For example, `DataFrame.kurtosis` is not yet implemented, so when a user calls it they will see this warning:\n",
+ "\n",
+ "```\n",
+ "UserWarning: `DataFrame.kurtosis` defaulting to pandas implementation.\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Concept for exercise: Default to pandas\n",
+ "\n",
+ "In this section of the exercise we will see first-hand how the runtime is affected by operations that are not implemented."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import modin.pandas as pd\n",
+ "import pandas\n",
+ "import numpy as np\n",
+ "import time\n",
+ "\n",
+ "frame_data = np.random.randint(0, 100, size=(2**18, 2**8))\n",
+ "df = pd.DataFrame(frame_data).add_prefix(\"col\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pandas_df = pandas.DataFrame(frame_data).add_prefix(\"col\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modin_start = time.time()\n",
+ "\n",
+ "print(df.kurtosis())\n",
+ "\n",
+ "modin_end = time.time()\n",
+ "print(\"Modin kurtosis took {} seconds.\".format(round(modin_end - modin_start, 4)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pandas_start = time.time()\n",
+ "\n",
+ "print(pandas_df.kurtosis())\n",
+ "\n",
+ "pandas_end = time.time()\n",
+ "print(\"pandas kurtosis took {} seconds.\".format(round(pandas_end - pandas_start, 4)))"
+ ]
+ },
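+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also confirm programmatically that an operation defaulted to pandas. The sketch below assumes the message shown above is emitted through Python's standard `warnings` module, so it can be captured with `warnings.catch_warnings`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: capture the defaulting-to-pandas warning.\n",
+ "# Assumption: the warning is raised via Python's `warnings` module, as the message above suggests.\n",
+ "import warnings\n",
+ "\n",
+ "with warnings.catch_warnings(record=True) as caught:\n",
+ " warnings.simplefilter(\"always\")\n",
+ " df.kurtosis()\n",
+ "\n",
+ "print(any(\"defaulting to pandas\" in str(w.message) for w in caught))"
+ ]
+ },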
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Concept for exercise: Register custom functions\n",
+ "\n",
+ "Modin's user-facing API is pandas, but it is possible that we do not yet support your favorite or most-needed functionality. Your user-defined function may also execute more efficiently if you declare in advance what type of function it is (e.g. map, reduction, etc.). In either case, you can register a custom function to be applied to your data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Registering a custom function for all query compilers\n",
+ "\n",
+ "To register a custom function for a query compiler, we first need to import it:\n",
+ "\n",
+ "```python\n",
+ "from modin.backends.pandas.query_compiler import PandasQueryCompiler\n",
+ "```\n",
+ "\n",
+ "The `PandasQueryCompiler` is responsible for defining and compiling the queries that Modin can operate on, and is specific to the pandas backend. Any queries defined here must both operate on and produce a `pandas.DataFrame`. Many functionalities have very simple implementations, as you can see in the current code: [Link](https://github.com/modin-project/modin/blob/f15fb8ea776ed039893130b1e85053e875912d4b/modin/backends/pandas/query_compiler.py#L365).\n",
+ "\n",
+ "If we want to register a new function, we first need to understand what kind of function it is. In our example, we will use `kurtosis`, which is a reduction. So we next want to import the function type so we can use it in our definition:\n",
+ "\n",
+ "```python\n",
+ "from modin.data_management.functions import ReductionFunction\n",
+ "```\n",
+ "\n",
+ "Then we can just use the `ReductionFunction.register` `classmethod` and assign it to the `PandasQueryCompiler`:\n",
+ "\n",
+ "```python\n",
+ "PandasQueryCompiler.kurtosis = ReductionFunction.register(pandas.DataFrame.kurtosis)\n",
+ "```\n",
+ "\n",
+ "Finally, we want a handle to it from the `DataFrame`, so we need to create a way to do that:\n",
+ "\n",
+ "```python\n",
+ "def kurtosis_func(self, **kwargs):\n",
+ " # The constructor allows you to pass in a query compiler as a keyword argument\n",
+ " return self.__constructor__(query_compiler=self._query_compiler.kurtosis(**kwargs))\n",
+ "\n",
+ "pd.DataFrame.kurtosis_custom = kurtosis_func\n",
+ "```\n",
+ "\n",
+ "And then you can use it like you usually would:\n",
+ "\n",
+ "```python\n",
+ "df.kurtosis_custom()\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from modin.backends.pandas.query_compiler import PandasQueryCompiler\n",
+ "from modin.data_management.functions import ReductionFunction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "PandasQueryCompiler.kurtosis_custom = ReductionFunction.register(pandas.DataFrame.kurtosis)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The function signature came from the pandas documentation:\n",
+ "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.kurtosis.html\n",
+ "def kurtosis_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):\n",
+ " # We need to specify the axis for the backend\n",
+ " if axis is None:\n",
+ " axis = 0\n",
+ " # The constructor allows you to pass in a query compiler as a keyword argument\n",
+ " # Reduce dimension is used for reductions\n",
+ " # We also pass all keyword arguments here to ensure correctness\n",
+ " return self._reduce_dimension(\n",
+ " self._query_compiler.kurtosis_custom(\n",
+ " axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs\n",
+ " )\n",
+ " )\n",
+ "\n",
+ "pd.DataFrame.kurtosis_custom = kurtosis_func"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "print(df.kurtosis())\n",
+ "\n",
+ "end = time.time()\n",
+ "print(\"Modin kurtosis took {} seconds.\".format(end - start))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "\n",
+ "print(df.kurtosis_custom())\n",
+ "\n",
+ "end = time.time()\n",
+ "print(\"Modin kurtosis_custom took {} seconds.\".format(end - start))"
+ ]
+ },
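+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As in Exercise 2, you can check that the custom reduction matches pandas. This sketch reuses the `_to_pandas().equals(...)` pattern from the evaluation cell later in this notebook and assumes the distributed result matches pandas exactly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: compare the custom reduction against pandas (same pattern as the `mad` evaluation cell below).\n",
+ "df.kurtosis_custom()._to_pandas().equals(pandas_df.kurtosis())"
+ ]
+ },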
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Congratulations! You have just implemented `DataFrame.kurtosis`!\n",
+ "\n",
+ "## Consider opening a pull request: https://github.com/modin-project/modin/pulls\n",
+ "\n",
+ "For a complete list of what is implemented, see the [documentation](https://modin.readthedocs.io/en/latest/UsingPandasonRay/dataframe_supported.html)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test your knowledge: Add a custom function for another reduction: `DataFrame.mad`\n",
+ "\n",
+ "See the pandas documentation for the correct signature: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mad.html"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modin_mad_start = time.time()\n",
+ "\n",
+ "# Implement your function here! Put the result of your custom `mad` in the variable `modin_mad`\n",
+ "# Hint: Look at the kurtosis walkthrough above\n",
+ "modin_mad = ...\n",
+ "\n",
+ "modin_mad_end = time.time()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Evaluation code, do not change!\n",
+ "pandas_mad_start = time.time()\n",
+ "pandas_mad = pandas_df.mad()\n",
+ "pandas_mad_end = time.time()\n",
+ "\n",
+ "assert isinstance(modin_mad, pd.Series), \"This is not a distributed Modin object, try again\"\n",
+ "assert pandas_mad_end - pandas_mad_start > modin_mad_end - modin_mad_start, \\\n",
+ " \"Your implementation was too slow, or you used the default-to-pandas approach. Try again\"\n",
+ "assert modin_mad._to_pandas().equals(pandas_mad), \"Your result did not match the result of pandas, try again\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now that you are able to create custom functions, you know enough to contribute to Modin!"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py
index a1e1b18a4b6..a8a53b49d34 100644
--- a/modin/backends/base/query_compiler.py
+++ b/modin/backends/base/query_compiler.py
@@ -23,15 +23,18 @@
CatDefault,
GroupByDefault,
)
+from modin.error_message import ErrorMessage
from pandas.core.dtypes.common import is_scalar
import pandas.core.resample
import pandas
import numpy as np
+from typing import List, Hashable
def _get_axis(axis):
def axis_getter(self):
+ ErrorMessage.default_to_pandas(f"DataFrame.get_axis({axis})")
return self.to_pandas().axes[axis]
return axis_getter
@@ -56,7 +59,8 @@ class BaseQueryCompiler(abc.ABC):
@abc.abstractmethod
def default_to_pandas(self, pandas_op, *args, **kwargs):
- """Default to pandas behavior.
+ """
+ Default to pandas behavior.
Parameters
----------
@@ -134,10 +138,9 @@ def concat(df, axis, other, **kwargs):
else:
if isinstance(other, (list, np.ndarray)) and len(other) == 1:
other = other[0]
- how = kwargs.pop("join", None)
ignore_index = kwargs.pop("ignore_index", None)
- kwargs["how"] = how
- result = df.join(other, **kwargs)
+ kwargs["how"] = kwargs.pop("join", None)
+ result = df.join(other, rsuffix="r_", **kwargs)
if ignore_index:
if axis == 0:
result = result.reset_index(drop=True)
@@ -482,6 +485,30 @@ def reset_index(self, **kwargs):
"""
return DataFrameDefault.register(pandas.DataFrame.reset_index)(self, **kwargs)
+ def set_index_from_columns(
+ self, keys: List[Hashable], drop: bool = True, append: bool = False
+ ):
+ """Create new row labels from a list of columns.
+
+ Parameters
+ ----------
+ keys : list of hashable
+ The list of column names that will become the new index.
+ drop : boolean
+ Whether or not to drop the columns provided in the `keys` argument.
+ append : boolean
+ Whether or not to add the columns in `keys` as new levels appended to the
+ existing index.
+
+ Returns
+ -------
+ PandasQueryCompiler
+ A new QueryCompiler with updated index.
+ """
+ return DataFrameDefault.register(pandas.DataFrame.set_index)(
+ self, keys=keys, drop=drop, append=append
+ )
+
# END Abstract reindex/reset_index
# Full Reduce operations
@@ -491,14 +518,14 @@ def reset_index(self, **kwargs):
# we will implement a Distributed Series, and this will be returned
# instead.
- def is_monotonic(self):
+ def is_monotonic_increasing(self):
"""Return boolean if values in the object are monotonic_increasing.
Returns
-------
bool
"""
- return SeriesDefault.register(pandas.Series.is_monotonic)(self)
+ return SeriesDefault.register(pandas.Series.is_monotonic_increasing)(self)
def is_monotonic_decreasing(self):
"""Return boolean if values in the object are monotonic_decreasing.
@@ -1392,26 +1419,33 @@ def groupby_size(
reduce_args=reduce_args,
numeric_only=numeric_only,
drop=drop,
+ method="size",
)
- def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args, drop=False):
+ def groupby_agg(
+ self,
+ by,
+ is_multi_by,
+ axis,
+ agg_func,
+ agg_args,
+ agg_kwargs,
+ groupby_kwargs,
+ drop=False,
+ ):
+ if isinstance(by, type(self)) and len(by.columns) == 1:
+ by = by.columns[0] if drop else by.to_pandas().squeeze()
+ elif isinstance(by, type(self)):
+ by = list(by.columns)
+
return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.aggregate)(
self,
by=by,
+ is_multi_by=is_multi_by,
axis=axis,
agg_func=agg_func,
- groupby_args=groupby_args,
- agg_args=agg_args,
- drop=drop,
- )
-
- def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False):
- return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.aggregate)(
- self,
- by=by,
- func_dict=func_dict,
- groupby_args=groupby_args,
- agg_args=agg_args,
+ groupby_args=groupby_kwargs,
+ agg_args=agg_kwargs,
drop=drop,
)
@@ -1493,6 +1527,21 @@ def repeat(self, repeats):
index = property(_get_axis(0), _set_axis(0))
columns = property(_get_axis(1), _set_axis(1))
+ def get_axis(self, axis):
+ """
+ Return index labels of the specified axis.
+
+ Parameters
+ ----------
+ axis: int,
+ Axis to return labels on.
+
+ Returns
+ -------
+ Index
+ """
+ return self.index if axis == 0 else self.columns
+
def view(self, index=None, columns=None):
index = [] if index is None else index
columns = [] if columns is None else columns
@@ -1502,6 +1551,43 @@ def applyier(df):
return DataFrameDefault.register(applyier)(self)
+ def insert_item(self, axis, loc, value, how="inner", replace=False):
+ """
+ Insert new column/row defined by `value` at the specified `loc`.
+
+ Parameters
+ ----------
+ axis: int, axis to insert along
+ loc: int, position to insert `value`
+ value: BaseQueryCompiler, value to insert
+ how : str,
+ The type of join to make.
+ replace: bool (default False),
+ Whether to insert item at `loc` or to replace item at `loc`.
+
+ Returns
+ -------
+ A new BaseQueryCompiler
+ """
+ assert isinstance(value, type(self))
+
+ def mask(idx):
+ if len(idx) == len(self.get_axis(axis)):
+ return self
+ return (
+ self.getitem_column_array(idx, numeric=True)
+ if axis
+ else self.getitem_row_array(idx)
+ )
+
+ if 0 <= loc < len(self.get_axis(axis)):
+ first_mask = mask(list(range(loc)))
+ second_mask_loc = loc + 1 if replace else loc
+ second_mask = mask(list(range(second_mask_loc, len(self.get_axis(axis)))))
+ return first_mask.concat(axis, [value, second_mask], join=how, sort=False)
+ else:
+ return self.concat(axis, [value], join=how, sort=False)
+
def setitem(self, axis, key, value):
def setitem(df, axis, key, value):
if is_scalar(key) and isinstance(value, pandas.DataFrame):
@@ -1560,6 +1646,64 @@ def has_multiindex(self, axis=0):
assert axis == 1
return isinstance(self.columns, pandas.MultiIndex)
+ def get_index_name(self, axis=0):
+ """
+ Get index name of specified axis.
+
+ Parameters
+ ----------
+ axis: int (default 0),
+ Axis to return index name on.
+
+ Returns
+ -------
+ hashable
+ Index name, None for MultiIndex.
+ """
+ return self.get_axis(axis).name
+
+ def set_index_name(self, name, axis=0):
+ """
+ Set index name for the specified axis.
+
+ Parameters
+ ----------
+ name: hashable,
+ New index name.
+ axis: int (default 0),
+ Axis to set name along.
+ """
+ self.get_axis(axis).name = name
+
+ def get_index_names(self, axis=0):
+ """
+ Get index names of specified axis.
+
+ Parameters
+ ----------
+ axis: int (default 0),
+ Axis to return index names on.
+
+ Returns
+ -------
+ list
+ Index names.
+ """
+ return self.get_axis(axis).names
+
+ def set_index_names(self, names, axis=0):
+ """
+ Set index names for the specified axis.
+
+ Parameters
+ ----------
+ names: list,
+ New index names.
+ axis: int (default 0),
+ Axis to set names along.
+ """
+ self.get_axis(axis).names = names
+
# DateTime methods
dt_ceil = DateTimeDefault.register(pandas.Series.dt.ceil)
@@ -1765,5 +1909,6 @@ def has_multiindex(self, axis=0):
kurt = DataFrameDefault.register(pandas.DataFrame.kurt)
sum_min_count = DataFrameDefault.register(pandas.DataFrame.sum)
prod_min_count = DataFrameDefault.register(pandas.DataFrame.prod)
+ compare = DataFrameDefault.register(pandas.DataFrame.compare)
# End of DataFrame methods
diff --git a/modin/backends/pandas/parsers.py b/modin/backends/pandas/parsers.py
index 159ccd982e8..8f749a97f22 100644
--- a/modin/backends/pandas/parsers.py
+++ b/modin/backends/pandas/parsers.py
@@ -20,7 +20,7 @@
from pandas.io.common import infer_compression
import warnings
-from modin.engines.base.io import FileReader
+from modin.engines.base.io import FileDispatcher
from modin.data_management.utils import split_result_of_axis_func_pandas
from modin.error_message import ErrorMessage
@@ -99,7 +99,9 @@ def parse(fname, **kwargs):
index_col = kwargs.get("index_col", None)
if start is not None and end is not None:
# pop "compression" from kwargs because bio is uncompressed
- bio = FileReader.file_open(fname, "rb", kwargs.pop("compression", "infer"))
+ bio = FileDispatcher.file_open(
+ fname, "rb", kwargs.pop("compression", "infer")
+ )
if kwargs.get("encoding", None) is not None:
header = b"" + bio.readline()
else:
@@ -122,6 +124,52 @@ def parse(fname, **kwargs):
]
+class PandasCSVGlobParser(PandasCSVParser):
+ @staticmethod
+ def parse(chunks, **kwargs):
+ warnings.filterwarnings("ignore")
+ num_splits = kwargs.pop("num_splits", None)
+ index_col = kwargs.get("index_col", None)
+
+ pandas_dfs = []
+ for fname, start, end in chunks:
+ if start is not None and end is not None:
+ # pop "compression" from kwargs because bio is uncompressed
+ bio = FileDispatcher.file_open(
+ fname, "rb", kwargs.pop("compression", "infer")
+ )
+ if kwargs.get("encoding", None) is not None:
+ header = b"" + bio.readline()
+ else:
+ header = b""
+ bio.seek(start)
+ to_read = header + bio.read(end - start)
+ bio.close()
+ pandas_dfs.append(pandas.read_csv(BytesIO(to_read), **kwargs))
+ else:
+ # This only happens when we are reading with only one worker (Default)
+ return pandas.read_csv(fname, **kwargs)
+
+ # Combine read in data.
+ if len(pandas_dfs) > 1:
+ pandas_df = pandas.concat(pandas_dfs)
+ elif len(pandas_dfs) > 0:
+ pandas_df = pandas_dfs[0]
+ else:
+ pandas_df = pandas.DataFrame()
+
+ # Set internal index.
+ if index_col is not None:
+ index = pandas_df.index
+ else:
+ # The lengths will become the RangeIndex
+ index = len(pandas_df)
+ return _split_result_for_readers(1, num_splits, pandas_df) + [
+ index,
+ pandas_df.dtypes,
+ ]
+
+
class PandasFWFParser(PandasParser):
@staticmethod
def parse(fname, **kwargs):
@@ -131,7 +179,9 @@ def parse(fname, **kwargs):
index_col = kwargs.get("index_col", None)
if start is not None and end is not None:
# pop "compression" from kwargs because bio is uncompressed
- bio = FileReader.file_open(fname, "rb", kwargs.pop("compression", "infer"))
+ bio = FileDispatcher.file_open(
+ fname, "rb", kwargs.pop("compression", "infer")
+ )
if kwargs.get("encoding", None) is not None:
header = b"" + bio.readline()
else:
@@ -203,8 +253,8 @@ def parse(fname, **kwargs):
from openpyxl.worksheet.worksheet import Worksheet
from pandas.core.dtypes.common import is_list_like
from pandas.io.excel._util import (
- _fill_mi_header,
- _maybe_convert_usecols,
+ fill_mi_header,
+ maybe_convert_usecols,
)
from pandas.io.parsers import TextParser
import re
@@ -221,7 +271,7 @@ def parse(fname, **kwargs):
ws = Worksheet(wb)
# Read the raw data
with ZipFile(fname) as z:
- with z.open("xl/worksheets/{}.xml".format(sheet_name.lower())) as file:
+ with z.open("xl/worksheets/{}.xml".format(sheet_name)) as file:
file.seek(start)
bytes_data = file.read(end - start)
@@ -244,21 +294,21 @@ def update_row_nums(match):
"""
b = match.group(0)
return re.sub(
- b"\d+", # noqa: W605
+ br"\d+",
lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows).encode(
"utf-8"
),
b,
)
- bytes_data = re.sub(b'r="[A-Z]*\d+"', update_row_nums, bytes_data) # noqa: W605
+ bytes_data = re.sub(br'r="[A-Z]*\d+"', update_row_nums, bytes_data)
bytesio = BytesIO(excel_header + bytes_data + footer)
# Use openpyxl to read/parse sheet data
reader = WorksheetReader(ws, bytesio, ex.shared_strings, False)
# Attach cells to worksheet object
reader.bind_cells()
data = PandasExcelParser.get_sheet_data(ws, kwargs.pop("convert_float", True))
- usecols = _maybe_convert_usecols(kwargs.pop("usecols", None))
+ usecols = maybe_convert_usecols(kwargs.pop("usecols", None))
header = kwargs.pop("header", 0)
index_col = kwargs.pop("index_col", None)
# skiprows is handled externally
@@ -271,7 +321,7 @@ def update_row_nums(match):
control_row = [True] * len(data[0])
for row in header:
- data[row], control_row = _fill_mi_header(data[row], control_row)
+ data[row], control_row = fill_mi_header(data[row], control_row)
# Handle MultiIndex for row Index if necessary
if is_list_like(index_col):
# Forward fill values for MultiIndex index.
@@ -289,7 +339,6 @@ def update_row_nums(match):
data[row][col] = last
else:
last = data[row][col]
-
parser = TextParser(
data,
header=header,
@@ -302,7 +351,7 @@ def update_row_nums(match):
# In excel if you create a row with only a border (no values), this parser will
# interpret that as a row of NaN values. Pandas discards these values, so we
# also must discard these values.
- pandas_df = parser.read().dropna(how="all")
+ pandas_df = parser.read()
# Since we know the number of rows that occur before this partition, we can
# correctly assign the index in cases of RangeIndex. If it is not a RangeIndex,
# the index is already correct because it came from the data.
@@ -331,7 +380,9 @@ def parse(fname, **kwargs):
end = kwargs.pop("end", None)
if start is not None and end is not None:
# pop "compression" from kwargs because bio is uncompressed
- bio = FileReader.file_open(fname, "rb", kwargs.pop("compression", "infer"))
+ bio = FileDispatcher.file_open(
+ fname, "rb", kwargs.pop("compression", "infer")
+ )
bio.seek(start)
to_read = b"" + bio.read(end - start)
bio.close()
@@ -355,6 +406,17 @@ class PandasParquetParser(PandasParser):
def parse(fname, **kwargs):
num_splits = kwargs.pop("num_splits", None)
columns = kwargs.get("columns", None)
+ if fname.startswith("s3://"):
+ from botocore.exceptions import NoCredentialsError
+ import s3fs
+
+ try:
+ fs = s3fs.S3FileSystem()
+ fname = fs.open(fname)
+ except NoCredentialsError:
+ fs = s3fs.S3FileSystem(anon=True)
+ fname = fs.open(fname)
+
if num_splits is None:
return pandas.read_parquet(fname, **kwargs)
kwargs["use_pandas_metadata"] = True
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index 5615f6a6e05..3a060be73b2 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -15,18 +15,21 @@
import pandas
from pandas.core.common import is_bool_indexer
from pandas.core.indexing import check_bool_indexer
+from pandas.core.indexes.api import ensure_index_from_sequences
from pandas.core.dtypes.common import (
is_list_like,
is_numeric_dtype,
is_datetime_or_timedelta_dtype,
- is_scalar,
)
from pandas.core.base import DataError
+from collections.abc import Iterable, Container
+from typing import List, Hashable
import warnings
+
from modin.backends.base.query_compiler import BaseQueryCompiler
from modin.error_message import ErrorMessage
-from modin.utils import try_cast_to_pandas, wrap_udf_function
+from modin.utils import try_cast_to_pandas, wrap_udf_function, hashable
from modin.data_management.functions import (
FoldFunction,
MapFunction,
@@ -34,6 +37,7 @@
ReductionFunction,
BinaryFunction,
GroupbyReduceFunction,
+ groupby_reduce_functions,
)
@@ -158,7 +162,8 @@ def __init__(self, modin_frame):
self._modin_frame = modin_frame
def default_to_pandas(self, pandas_op, *args, **kwargs):
- """Default to pandas behavior.
+ """
+ Default to pandas behavior.
Parameters
----------
@@ -174,11 +179,12 @@ def default_to_pandas(self, pandas_op, *args, **kwargs):
PandasQueryCompiler
The result of the `pandas_op`, converted back to PandasQueryCompiler
- Note
- ----
+ Notes
+ -----
This operation takes a distributed object and converts it directly to pandas.
"""
- ErrorMessage.default_to_pandas(str(pandas_op))
+ op_name = getattr(pandas_op, "__name__", str(pandas_op))
+ ErrorMessage.default_to_pandas(op_name)
args = (a.to_pandas() if isinstance(a, type(self)) else a for a in args)
kwargs = {
k: v.to_pandas if isinstance(v, type(self)) else v
@@ -529,19 +535,61 @@ def reset_index(self, **kwargs):
if level is not None or self.has_multiindex():
return self.default_to_pandas(pandas.DataFrame.reset_index, **kwargs)
if not drop:
- new_column_name = (
- self.index.name
- if self.index.name is not None
- else "index"
- if "index" not in self.columns
- else "level_0"
- )
- new_self = self.insert(0, new_column_name, self.index)
- else:
- new_self = self.copy()
+ return self.__constructor__(self._modin_frame.from_labels())
+ new_self = self.copy()
new_self.index = pandas.RangeIndex(len(new_self.index))
return new_self
+ def set_index_from_columns(
+ self, keys: List[Hashable], drop: bool = True, append: bool = False
+ ):
+ """Create new row labels from a list of columns.
+
+ Parameters
+ ----------
+ keys : list of hashable
+ The list of column names that will become the new index.
+ drop : boolean
+ Whether or not to drop the columns provided in the `keys` argument
+ append : boolean
+ Whether or not to add the columns in `keys` as new levels appended to the
+ existing index.
+
+ Returns
+ -------
+ PandasQueryCompiler
+ A new QueryCompiler with updated index.
+ """
+ new_modin_frame = self._modin_frame.to_labels(keys)
+ if append:
+ arrays = []
+ # Appending keeps the original order of the index levels, then appends the
+ # new index objects.
+ names = list(self.index.names)
+ if isinstance(self.index, pandas.MultiIndex):
+ for i in range(self.index.nlevels):
+ arrays.append(self.index._get_level_values(i))
+ else:
+ arrays.append(self.index)
+
+ # Add the names in the correct order.
+ names.extend(new_modin_frame.index.names)
+ if isinstance(new_modin_frame.index, pandas.MultiIndex):
+ for i in range(new_modin_frame.index.nlevels):
+ arrays.append(new_modin_frame.index._get_level_values(i))
+ else:
+ arrays.append(new_modin_frame.index)
+ new_modin_frame.index = ensure_index_from_sequences(arrays, names)
+ if not drop:
+ # The algebraic operator for this operation always drops the column, but we
+ # can copy the data in this object and just use the index from the result of
+ # the query compiler call.
+ result = self._modin_frame.copy()
+ result.index = new_modin_frame.index
+ else:
+ result = new_modin_frame
+ return self.__constructor__(result)
+
# END Reindex/reset_index
# Transpose
@@ -589,46 +637,21 @@ def is_series_like(self):
# MapReduce operations
- def _is_monotonic(self, func_type=None):
- funcs = {
- "increasing": lambda df: df.is_monotonic_increasing,
- "decreasing": lambda df: df.is_monotonic_decreasing,
- }
-
- monotonic_fn = funcs.get(func_type, funcs["increasing"])
-
- def is_monotonic_map(df):
- df = df.squeeze(axis=1)
- return [monotonic_fn(df), df.iloc[0], df.iloc[len(df) - 1]]
-
- def is_monotonic_reduce(df):
- df = df.squeeze(axis=1)
-
- common_case = df[0].all()
- left_edges = df[1]
- right_edges = df[2]
-
- edges_list = []
- for i in range(len(left_edges)):
- edges_list.extend([left_edges.iloc[i], right_edges.iloc[i]])
-
- edge_case = monotonic_fn(pandas.Series(edges_list))
- return [common_case and edge_case]
+ def is_monotonic_decreasing(self):
+ def is_monotonic_decreasing(df):
+ return pandas.DataFrame([df.squeeze(axis=1).is_monotonic_decreasing])
- return MapReduceFunction.register(
- is_monotonic_map, is_monotonic_reduce, axis=0
- )(self)
+ return self.default_to_pandas(is_monotonic_decreasing)
- def is_monotonic_decreasing(self):
- return self._is_monotonic(func_type="decreasing")
+ def is_monotonic_increasing(self):
+ def is_monotonic_increasing(df):
+ return pandas.DataFrame([df.squeeze(axis=1).is_monotonic_increasing])
- is_monotonic = _is_monotonic
+ return self.default_to_pandas(is_monotonic_increasing)
count = MapReduceFunction.register(pandas.DataFrame.count, pandas.DataFrame.sum)
- max = MapReduceFunction.register(pandas.DataFrame.max, pandas.DataFrame.max)
- min = MapReduceFunction.register(pandas.DataFrame.min, pandas.DataFrame.min)
- sum = MapReduceFunction.register(pandas.DataFrame.sum, pandas.DataFrame.sum)
- prod = MapReduceFunction.register(pandas.DataFrame.prod, pandas.DataFrame.prod)
+ sum = MapReduceFunction.register(pandas.DataFrame.sum)
+ prod = MapReduceFunction.register(pandas.DataFrame.prod)
any = MapReduceFunction.register(pandas.DataFrame.any, pandas.DataFrame.any)
all = MapReduceFunction.register(pandas.DataFrame.all, pandas.DataFrame.all)
memory_usage = MapReduceFunction.register(
@@ -636,18 +659,69 @@ def is_monotonic_decreasing(self):
lambda x, *args, **kwargs: pandas.DataFrame.sum(x),
axis=0,
)
- mean = MapReduceFunction.register(
- lambda df, **kwargs: df.apply(
- lambda x: (x.sum(skipna=kwargs.get("skipna", True)), x.count()),
- axis=kwargs.get("axis", 0),
- result_type="reduce",
- ).set_axis(df.axes[kwargs.get("axis", 0) ^ 1], axis=0),
- lambda df, **kwargs: df.apply(
- lambda x: x.apply(lambda d: d[0]).sum(skipna=kwargs.get("skipna", True))
- / x.apply(lambda d: d[1]).sum(skipna=kwargs.get("skipna", True)),
- axis=kwargs.get("axis", 0),
- ).set_axis(df.axes[kwargs.get("axis", 0) ^ 1], axis=0),
- )
+
+ def max(self, axis, **kwargs):
+ def map_func(df, **kwargs):
+ return pandas.DataFrame.max(df, **kwargs)
+
+ def reduce_func(df, **kwargs):
+ if kwargs.get("numeric_only", False):
+ kwargs = kwargs.copy()
+ kwargs["numeric_only"] = False
+ return pandas.DataFrame.max(df, **kwargs)
+
+ return MapReduceFunction.register(map_func, reduce_func)(
+ self, axis=axis, **kwargs
+ )
+
+ def min(self, axis, **kwargs):
+ def map_func(df, **kwargs):
+ return pandas.DataFrame.min(df, **kwargs)
+
+ def reduce_func(df, **kwargs):
+ if kwargs.get("numeric_only", False):
+ kwargs = kwargs.copy()
+ kwargs["numeric_only"] = False
+ return pandas.DataFrame.min(df, **kwargs)
+
+ return MapReduceFunction.register(map_func, reduce_func)(
+ self, axis=axis, **kwargs
+ )
+
+ def mean(self, axis, **kwargs):
+ if kwargs.get("level") is not None:
+ return self.default_to_pandas(pandas.DataFrame.mean, axis=axis, **kwargs)
+
+ skipna = kwargs.get("skipna", True)
+
+ # TODO-FIX: this function may work incorrectly with user-defined "numeric" values.
+ # Since `count(numeric_only=True)` discards all unknown "numeric" types, we can get an incorrect
+ # divisor inside the reduce function.
+ def map_fn(df, **kwargs):
+ result = pandas.DataFrame(
+ {
+ "sum": df.sum(axis=axis, skipna=skipna),
+ "count": df.count(axis=axis, numeric_only=True),
+ }
+ )
+ return result if axis else result.T
+
+ def reduce_fn(df, **kwargs):
+ sum_cols = df["sum"] if axis else df.loc["sum"]
+ count_cols = df["count"] if axis else df.loc["count"]
+
+ if not isinstance(sum_cols, pandas.Series):
+ # If we got `NaN` as the result of the sum in any axis partition,
+ # then we must consider the whole sum as `NaN`, so setting `skipna=False`
+ sum_cols = sum_cols.sum(axis=axis, skipna=False)
+ count_cols = count_cols.sum(axis=axis, skipna=False)
+ return sum_cols / count_cols
+
+ return MapReduceFunction.register(
+ map_fn,
+ reduce_fn,
+ preserve_index=(kwargs.get("numeric_only") is not None),
+ )(self, axis=axis, **kwargs)
def value_counts(self, **kwargs):
"""
@@ -657,91 +731,11 @@ def value_counts(self, **kwargs):
-------
PandasQueryCompiler
"""
- if kwargs.get("bins", None) is not None:
- new_modin_frame = self._modin_frame._apply_full_axis(
- 0, lambda df: df.squeeze(axis=1).value_counts(**kwargs)
- )
- return self.__constructor__(new_modin_frame)
-
- def map_func(df, *args, **kwargs):
- return df.squeeze(axis=1).value_counts(**kwargs)
-
- def reduce_func(df, *args, **kwargs):
- normalize = kwargs.get("normalize", False)
- sort = kwargs.get("sort", True)
- ascending = kwargs.get("ascending", False)
- dropna = kwargs.get("dropna", True)
-
- try:
- result = df.squeeze(axis=1).groupby(df.index, sort=False).sum()
- # This will happen with Arrow buffer read-only errors. We don't want to copy
- # all the time, so this will try to fast-path the code first.
- except (ValueError):
- result = df.copy().squeeze(axis=1).groupby(df.index, sort=False).sum()
-
- if not dropna and np.nan in df.index:
- result = result.append(
- pandas.Series(
- [df.squeeze(axis=1).loc[[np.nan]].sum()], index=[np.nan]
- )
- )
- if normalize:
- result = result / df.squeeze(axis=1).sum()
-
- result = result.sort_values(ascending=ascending) if sort else result
-
- # We want to sort both values and indices of the result object.
- # This function will sort indices for equal values.
- def sort_index_for_equal_values(result, ascending):
- """
- Sort indices for equal values of result object.
-
- Parameters
- ----------
- result : pandas.Series or pandas.DataFrame with one column
- The object whose indices for equal values is needed to sort.
- ascending : boolean
- Sort in ascending (if it is True) or descending (if it is False) order.
-
- Returns
- -------
- pandas.DataFrame
- A new DataFrame with sorted indices.
- """
- is_range = False
- is_end = False
- i = 0
- new_index = np.empty(len(result), dtype=type(result.index))
- while i < len(result):
- j = i
- if i < len(result) - 1:
- while result[result.index[i]] == result[result.index[i + 1]]:
- i += 1
- if is_range is False:
- is_range = True
- if i == len(result) - 1:
- is_end = True
- break
- if is_range:
- k = j
- for val in sorted(
- result.index[j : i + 1], reverse=not ascending
- ):
- new_index[k] = val
- k += 1
- if is_end:
- break
- is_range = False
- else:
- new_index[j] = result.index[j]
- i += 1
- return pandas.DataFrame(result, index=new_index)
- return sort_index_for_equal_values(result, ascending)
+ def value_counts(df):
+ return df.squeeze(axis=1).value_counts(**kwargs).to_frame()
- return MapReduceFunction.register(map_func, reduce_func, preserve_index=False)(
- self, **kwargs
- )
+ return self.default_to_pandas(value_counts)
# END MapReduce operations
@@ -1337,83 +1331,13 @@ def searchsorted(self, **kwargs):
PandasQueryCompiler
"""
- def map_func(part, *args, **kwargs):
-
- elements_number = len(part.index)
- assert elements_number > 0, "Wrong mapping behaviour of MapReduce"
-
- # unify value type
- value = kwargs.pop("value")
- value = np.array([value]) if is_scalar(value) else value
-
- if elements_number == 1:
- part = part[part.columns[0]]
- else:
- part = part.squeeze()
-
- part_index_start = part.index.start
- part_index_stop = part.index.stop
-
- result = part.searchsorted(value=value, *args, **kwargs)
-
- processed_results = {}
- value_number = 0
- for value_result in result:
- value_result += part_index_start
-
- if value_result > part_index_start and value_result < part_index_stop:
- processed_results[f"value{value_number}"] = {
- "relative_location": "current_partition",
- "index": value_result,
- }
- elif value_result <= part_index_start:
- processed_results[f"value{value_number}"] = {
- "relative_location": "previoius_partitions",
- "index": part_index_start,
- }
- else:
- processed_results[f"value{value_number}"] = {
- "relative_location": "next_partitions",
- "index": part_index_stop,
- }
-
- value_number += 1
-
- return pandas.DataFrame(processed_results)
-
- def reduce_func(map_results, *args, **kwargs):
- def get_value_index(value_result):
- value_result_grouped = value_result.groupby(level=0)
- rel_location = value_result_grouped.get_group("relative_location")
- ind = value_result_grouped.get_group("index")
- # executes if result is inside of the mapped part
- if "current_partition" in rel_location.values:
- assert (
- rel_location[rel_location == "current_partition"].count() == 1
- ), "Each value should have single result"
- return ind[rel_location.values == "current_partition"]
- # executes if result is between mapped parts
- elif rel_location.nunique(dropna=False) > 1:
- return ind[rel_location.values == "previoius_partitions"][0]
- # executes if result is outside of the mapped part
- else:
- if "next_partitions" in rel_location.values:
- return ind[-1]
- else:
- return ind[0]
-
- map_results_parsed = map_results.apply(
- lambda ser: get_value_index(ser)
- ).squeeze()
-
- if isinstance(map_results_parsed, pandas.Series):
- map_results_parsed = map_results_parsed.to_list()
-
- return pandas.Series(map_results_parsed)
+ def searchsorted(df):
+ result = df.squeeze(axis=1).searchsorted(**kwargs)
+ if not is_list_like(result):
+ result = [result]
+ return pandas.DataFrame(result)
- return MapReduceFunction.register(map_func, reduce_func, preserve_index=False)(
- self, **kwargs
- )
+ return self.default_to_pandas(searchsorted)
# Dt map partitions operations
@@ -1443,12 +1367,19 @@ def get_value_index(value_result):
dt_is_leap_year = MapFunction.register(_dt_prop_map("is_leap_year"))
dt_daysinmonth = MapFunction.register(_dt_prop_map("daysinmonth"))
dt_days_in_month = MapFunction.register(_dt_prop_map("days_in_month"))
- dt_tz = MapReduceFunction.register(
- _dt_prop_map("tz"), lambda df: pandas.DataFrame(df.iloc[0]), axis=0
- )
- dt_freq = MapReduceFunction.register(
- _dt_prop_map("freq"), lambda df: pandas.DataFrame(df.iloc[0]), axis=0
- )
+
+ def dt_tz(self):
+ def datetime_tz(df):
+ return pandas.DataFrame([df.squeeze(axis=1).dt.tz])
+
+ return self.default_to_pandas(datetime_tz)
+
+ def dt_freq(self):
+ def datetime_freq(df):
+ return pandas.DataFrame([df.squeeze(axis=1).dt.freq])
+
+ return self.default_to_pandas(datetime_freq)
+
dt_to_period = MapFunction.register(_dt_func_map("to_period"))
dt_to_pydatetime = MapFunction.register(_dt_func_map("to_pydatetime"))
dt_tz_localize = MapFunction.register(_dt_func_map("tz_localize"))
@@ -1559,6 +1490,20 @@ def describe(self, **kwargs):
.astype(self.dtypes)
.describe(**kwargs)
)
+ new_index = empty_df.index
+
+ # Note: `describe` converts timestamp type to object type,
+ # which results in the loss of two values in the index (`first` and `last`)
+ # for an empty DataFrame.
+ datetime_is_numeric = kwargs.get("datetime_is_numeric") or False
+ if not any(map(is_numeric_dtype, empty_df.dtypes)) and not datetime_is_numeric:
+ for col_name in empty_df.dtypes.index:
+ # if the type of `col_name` was previously datetime or timedelta
+ if is_datetime_or_timedelta_dtype(self.dtypes[col_name]):
+ new_index = pandas.Index(
+ empty_df.index.to_list() + ["first"] + ["last"]
+ )
+ break
def describe_builder(df, internal_indices=[]):
return df.iloc[:, internal_indices].describe(**kwargs)
@@ -1568,7 +1513,7 @@ def describe_builder(df, internal_indices=[]):
0,
describe_builder,
empty_df.columns,
- new_index=empty_df.index,
+ new_index=new_index,
new_columns=empty_df.columns,
)
)
@@ -2151,7 +2096,9 @@ def setitem(self, axis, key, value):
Returns:
A new QueryCompiler
"""
+ return self._setitem(axis=axis, key=key, value=value, how=None)
+ def _setitem(self, axis, key, value, how="inner"):
def setitem_builder(df, internal_indices=[]):
df = df.copy()
if len(internal_indices) == 1:
@@ -2168,55 +2115,13 @@ def setitem_builder(df, internal_indices=[]):
if isinstance(value, type(self)):
value.columns = [key]
- if axis == 0:
- idx = self.columns.get_indexer_for([key])[0]
- if 0 < idx < len(self.columns) - 1:
- first_mask = self._modin_frame.mask(
- col_numeric_idx=list(range(idx))
- )
- second_mask = self._modin_frame.mask(
- col_numeric_idx=list(range(idx + 1, len(self.columns)))
- )
- return self.__constructor__(
- first_mask._concat(
- 1, [value._modin_frame, second_mask], "inner", False
- )
- )
- else:
- mask = self.drop(columns=[key])._modin_frame
- if idx == 0:
- return self.__constructor__(
- value._modin_frame._concat(1, [mask], "inner", False)
- )
- else:
- return self.__constructor__(
- mask._concat(1, [value._modin_frame], "inner", False)
- )
- else:
+ if axis == 1:
value = value.transpose()
- idx = self.index.get_indexer_for([key])[0]
- if 0 < idx < len(self.index) - 1:
- first_mask = self._modin_frame.mask(
- row_numeric_idx=list(range(idx))
- )
- second_mask = self._modin_frame.mask(
- row_numeric_idx=list(range(idx + 1, len(self.index)))
- )
- return self.__constructor__(
- first_mask._concat(
- 0, [value._modin_frame, second_mask], "inner", False
- )
- )
- else:
- mask = self.drop(index=[key])._modin_frame
- if idx == 0:
- return self.__constructor__(
- value._modin_frame._concat(0, [mask], "inner", False)
- )
- else:
- return self.__constructor__(
- mask._concat(0, [value._modin_frame], "inner", False)
- )
+ idx = self.get_axis(axis ^ 1).get_indexer_for([key])[0]
+ return self.insert_item(axis ^ 1, idx, value, how, replace=True)
+
+ # TODO: rework by passing list-like values to `_apply_select_indices`
+ # as an item to distribute
if is_list_like(value):
new_modin_frame = self._modin_frame._apply_full_axis_select_indices(
axis,
@@ -2300,14 +2205,13 @@ def insert(self, loc, column, value):
Returns:
A new PandasQueryCompiler with new data inserted.
"""
+
+ if isinstance(value, type(self)):
+ value.columns = [column]
+ return self.insert_item(axis=1, loc=loc, value=value, how=None)
+
if is_list_like(value):
- # TODO make work with another querycompiler object as `value`.
- # This will require aligning the indices with a `reindex` and ensuring that
- # the data is partitioned identically.
- if isinstance(value, pandas.Series):
- value = value.reindex(self.index)
- else:
- value = list(value)
+ value = list(value)
else:
value = [value] * len(self.index)
@@ -2316,6 +2220,8 @@ def insert(df, internal_indices=[]):
df.insert(internal_idx, column, value)
return df
+ # TODO: rework by passing list-like values to `_apply_select_indices`
+ # as an item to distribute
new_modin_frame = self._modin_frame._apply_full_axis_select_indices(
0,
insert,
@@ -2457,92 +2363,275 @@ def _callable_func(self, func, axis, *args, **kwargs):
# nature. They require certain data to exist on the same partition, and
# after the shuffle, there should be only a local map required.
- groupby_count = GroupbyReduceFunction.register(
- lambda df, **kwargs: df.count(**kwargs), lambda df, **kwargs: df.sum(**kwargs)
- )
- groupby_any = GroupbyReduceFunction.register(
- lambda df, **kwargs: df.any(**kwargs), lambda df, **kwargs: df.any(**kwargs)
- )
- groupby_min = GroupbyReduceFunction.register(
- lambda df, **kwargs: df.min(**kwargs), lambda df, **kwargs: df.min(**kwargs)
- )
- groupby_prod = GroupbyReduceFunction.register(
- lambda df, **kwargs: df.prod(**kwargs), lambda df, **kwargs: df.prod(**kwargs)
- )
- groupby_max = GroupbyReduceFunction.register(
- lambda df, **kwargs: df.max(**kwargs), lambda df, **kwargs: df.max(**kwargs)
- )
- groupby_all = GroupbyReduceFunction.register(
- lambda df, **kwargs: df.all(**kwargs), lambda df, **kwargs: df.all(**kwargs)
- )
- groupby_sum = GroupbyReduceFunction.register(
- lambda df, **kwargs: df.sum(**kwargs), lambda df, **kwargs: df.sum(**kwargs)
- )
- groupby_size = GroupbyReduceFunction.register(
- lambda df, **kwargs: pandas.DataFrame(df.size()),
- lambda df, **kwargs: df.sum(),
- method="size",
- )
+ groupby_all = GroupbyReduceFunction.register("all")
+ groupby_any = GroupbyReduceFunction.register("any")
+ groupby_count = GroupbyReduceFunction.register("count")
+ groupby_max = GroupbyReduceFunction.register("max")
+ groupby_min = GroupbyReduceFunction.register("min")
+ groupby_prod = GroupbyReduceFunction.register("prod")
+ groupby_size = GroupbyReduceFunction.register("size", method="size")
+ groupby_sum = GroupbyReduceFunction.register("sum")
+
+ def _groupby_dict_reduce(
+ self, by, axis, agg_func, agg_args, agg_kwargs, groupby_kwargs, drop=False
+ ):
+ map_dict = {}
+ reduce_dict = {}
+ rename_columns = any(
+ not isinstance(fn, str) and isinstance(fn, Iterable)
+ for fn in agg_func.values()
+ )
+ for col, col_funcs in agg_func.items():
+ if not rename_columns:
+ map_dict[col], reduce_dict[col] = groupby_reduce_functions[col_funcs]
+ continue
+
+ if isinstance(col_funcs, str):
+ col_funcs = [col_funcs]
+
+ map_fns = []
+ for i, fn in enumerate(col_funcs):
+ if not isinstance(fn, str) and isinstance(fn, Iterable):
+ new_col_name, func = fn
+ elif isinstance(fn, str):
+ new_col_name, func = fn, fn
+ else:
+ raise TypeError
- def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False):
- """Apply aggregation functions to a grouped dataframe per-column.
+ map_fns.append((new_col_name, groupby_reduce_functions[func][0]))
+ reduced_col_name = (
+ (*col, new_col_name)
+ if isinstance(col, tuple)
+ else (col, new_col_name)
+ )
+ reduce_dict[reduced_col_name] = groupby_reduce_functions[func][1]
+ map_dict[col] = map_fns
+ return GroupbyReduceFunction.register(map_dict, reduce_dict)(
+ query_compiler=self,
+ by=by,
+ axis=axis,
+ groupby_args=groupby_kwargs,
+ map_args=agg_kwargs,
+ reduce_args=agg_kwargs,
+ numeric_only=False,
+ drop=drop,
+ )
- Parameters
- ----------
- by : PandasQueryCompiler
- The column to group by
- func_dict : dict of str, callable/string
- The dictionary mapping of column to function
- groupby_args : dict
- The dictionary of keyword arguments for the group by.
- agg_args : dict
- The dictionary of keyword arguments for the aggregation functions
- drop : bool
- Whether or not to drop the column from the data.
+ def groupby_agg(
+ self,
+ by,
+ is_multi_by,
+ axis,
+ agg_func,
+ agg_args,
+ agg_kwargs,
+ groupby_kwargs,
+ drop=False,
+ ):
+ def is_reduce_fn(fn, deep_level=0):
+ if not isinstance(fn, str) and isinstance(fn, Container):
+ # The `deep_level` parameter specifies how many nested containers have been encountered:
+ # - if it's 0, then we're outside of a container; `fn` could be either a function name
+ # or a container of function names/renamers.
+ # - if it's 1, then we're inside a container of function names/renamers. `fn` must be
+ # either a function name or a renamer (a renamer is a container of length 2:
+ # the first element is the new column name and the second is the function name).
+ assert deep_level == 0 or (
+ deep_level > 0 and len(fn) == 2
+ ), f"Got the renamer with incorrect length, expected 2 got {len(fn)}."
+ return (
+ all(is_reduce_fn(f, deep_level + 1) for f in fn)
+ if deep_level == 0
+ else is_reduce_fn(fn[1], deep_level + 1)
+ )
+ return isinstance(fn, str) and fn in groupby_reduce_functions
- Returns
- -------
- PandasQueryCompiler
- The result of the per-column aggregations on the grouped dataframe.
- """
- return self.default_to_pandas(
- lambda df: df.groupby(by=by, **groupby_args).agg(func_dict, **agg_args)
- )
+ if isinstance(agg_func, dict) and all(
+ is_reduce_fn(x) for x in agg_func.values()
+ ):
+ return self._groupby_dict_reduce(
+ by, axis, agg_func, agg_args, agg_kwargs, groupby_kwargs, drop
+ )
- def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args, drop=False):
- # since we're going to modify `groupby_args` dict in a `groupby_agg_builder`,
+ if callable(agg_func):
+ agg_func = wrap_udf_function(agg_func)
+
+ # since we're going to modify `groupby_kwargs` dict in a `groupby_agg_builder`,
# we want to copy it to not propagate these changes into source dict, in case
# of unsuccessful end of function
- groupby_args = groupby_args.copy()
+ groupby_kwargs = groupby_kwargs.copy()
+
+ as_index = groupby_kwargs.get("as_index", True)
+ if isinstance(by, type(self)):
+ # `drop` parameter indicates whether or not 'by' data came
+ # from the `self` frame:
+ # True: 'by' data came from the `self`
+ # False: external 'by' data
+ if drop:
+ internal_by = by.columns
+ by = [by]
+ else:
+ internal_by = []
+ by = [by]
+ else:
+ if not isinstance(by, list):
+ by = [by]
+ internal_by = [o for o in by if hashable(o) and o in self.columns]
+ internal_qc = (
+ [self.getitem_column_array(internal_by)] if len(internal_by) else []
+ )
+
+ by = internal_qc + by[len(internal_by) :]
- as_index = groupby_args.get("as_index", True)
+ broadcastable_by = [o._modin_frame for o in by if isinstance(o, type(self))]
+ not_broadcastable_by = [o for o in by if not isinstance(o, type(self))]
- def groupby_agg_builder(df):
+ def groupby_agg_builder(df, by=None, drop=False, partition_idx=None):
# Set `as_index` to True to track the metadata of the grouping object
# It is used to make sure that between phases we are constructing the
# right index and placing columns in the correct order.
- groupby_args["as_index"] = True
+ groupby_kwargs["as_index"] = True
+
+ internal_by_cols = pandas.Index([])
+ missmatched_cols = pandas.Index([])
+ if by is not None:
+ internal_by_df = by[internal_by]
- def compute_groupby(df):
- grouped_df = df.groupby(by=by, axis=axis, **groupby_args)
+ if isinstance(internal_by_df, pandas.Series):
+ internal_by_df = internal_by_df.to_frame()
+
+ missmatched_cols = internal_by_df.columns.difference(df.columns)
+ df = pandas.concat(
+ [df, internal_by_df[missmatched_cols]],
+ axis=1,
+ copy=False,
+ )
+ internal_by_cols = internal_by_df.columns
+
+ external_by = by.columns.difference(internal_by)
+ external_by_df = by[external_by].squeeze(axis=1)
+
+ if isinstance(external_by_df, pandas.DataFrame):
+ external_by_cols = [o for _, o in external_by_df.iteritems()]
+ else:
+ external_by_cols = [external_by_df]
+
+ by = internal_by_cols.tolist() + external_by_cols
+
+ else:
+ by = []
+
+ by += not_broadcastable_by
+
+ def compute_groupby(df, drop=False, partition_idx=0):
+ grouped_df = df.groupby(by=by, axis=axis, **groupby_kwargs)
try:
- result = agg_func(grouped_df, **agg_args)
+ if isinstance(agg_func, dict):
+ # Filter out keys that don't exist in this partition. This happens when some columns
+ # from the original dataframe didn't end up in every partition.
+ partition_dict = {
+ k: v for k, v in agg_func.items() if k in df.columns
+ }
+ result = grouped_df.agg(partition_dict)
+ else:
+ result = agg_func(grouped_df, **agg_kwargs)
# This happens when the partition is filled with non-numeric data and a
# numeric operation is done. We need to build the index here to avoid
# issues with extracting the index.
except (DataError, TypeError):
result = pandas.DataFrame(index=grouped_df.size().index)
+ if isinstance(result, pandas.Series):
+ result = result.to_frame(
+ result.name if result.name is not None else "__reduced__"
+ )
+
+ result_cols = result.columns
+ result.drop(columns=missmatched_cols, inplace=True, errors="ignore")
+
+ if not as_index:
+ keep_index_levels = len(by) > 1 and any(
+ isinstance(x, pandas.CategoricalDtype)
+ for x in df[internal_by_cols].dtypes
+ )
+
+ if internal_by_cols.nlevels != result_cols.nlevels:
+ cols_to_insert = (
+ pandas.Index([])
+ if keep_index_levels
+ else internal_by_cols.copy()
+ )
+ else:
+ cols_to_insert = (
+ internal_by_cols.intersection(result_cols)
+ if keep_index_levels
+ else internal_by_cols.difference(result_cols)
+ )
+
+ if keep_index_levels:
+ result.drop(
+ columns=cols_to_insert, inplace=True, errors="ignore"
+ )
+
+ drop = True
+ if partition_idx == 0:
+ drop = False
+ if not keep_index_levels:
+ lvls_to_drop = [
+ i
+ for i, name in enumerate(result.index.names)
+ if name not in cols_to_insert
+ ]
+ if len(lvls_to_drop) == result.index.nlevels:
+ drop = True
+ else:
+ result.index = result.index.droplevel(lvls_to_drop)
+
+ if (
+ not isinstance(result.index, pandas.MultiIndex)
+ and result.index.name is None
+ ):
+ drop = True
+
+ result.reset_index(drop=drop, inplace=True)
+
+ new_index_names = [
+ None
+ if isinstance(name, str) and name.startswith("__reduced__")
+ else name
+ for name in result.index.names
+ ]
+
+ cols_to_drop = (
+ result.columns[result.columns.str.match(r"__reduced__.*", na=False)]
+ if hasattr(result.columns, "str")
+ else []
+ )
+
+ result.index.names = new_index_names
+
+ # Do not drop columns if the result has a single column (i.e. it is effectively a Series)
+ if len(result.columns) > 1:
+ result.drop(columns=cols_to_drop, inplace=True)
+
return result
try:
- return compute_groupby(df)
+ return compute_groupby(df, drop, partition_idx)
# This will happen with Arrow buffer read-only errors. We don't want to copy
# all the time, so this will try to fast-path the code first.
except (ValueError, KeyError):
- return compute_groupby(df.copy())
+ return compute_groupby(df.copy(), drop, partition_idx)
- new_modin_frame = self._modin_frame._apply_full_axis(
- axis, lambda df: groupby_agg_builder(df)
+ apply_indices = list(agg_func.keys()) if isinstance(agg_func, dict) else None
+
+ new_modin_frame = self._modin_frame.broadcast_apply_full_axis(
+ axis=axis,
+ func=lambda df, by=None, partition_idx=None: groupby_agg_builder(
+ df, by, drop, partition_idx
+ ),
+ other=broadcastable_by,
+ apply_indices=apply_indices,
+ enumerate_partitions=True,
)
result = self.__constructor__(new_modin_frame)
@@ -2552,21 +2641,16 @@ def compute_groupby(df):
# determining the type of raised exception by applying `aggfunc`
# to empty DataFrame
try:
- agg_func(
+ pandas.DataFrame(index=[1], columns=[1]).agg(agg_func) if isinstance(
+ agg_func, dict
+ ) else agg_func(
pandas.DataFrame(index=[1], columns=[1]).groupby(level=0),
- **agg_args,
+ **agg_kwargs,
)
except Exception as e:
raise type(e)("No numeric types to aggregate.")
- # Reset `as_index` because it was edited inplace.
- groupby_args["as_index"] = as_index
- if as_index:
- return result
- else:
- if result.index.name is None or result.index.name in result.columns:
- drop = False
- return result.reset_index(drop=not drop)
+ return result
# END Manual Partitioning methods
@@ -2863,21 +2947,11 @@ def cat_codes(self):
# END Cat operations
- def has_multiindex(self, axis=0):
- """
- Check if specified axis is indexed by MultiIndex.
-
- Parameters
- ----------
- axis : 0 or 1, default 0
- The axis to check (0 - index, 1 - columns).
-
- Returns
- -------
- bool
- True if index at specified axis is MultiIndex and False otherwise.
- """
- if axis == 0:
- return isinstance(self.index, pandas.MultiIndex)
- assert axis == 1
- return isinstance(self.columns, pandas.MultiIndex)
+ def compare(self, other, **kwargs):
+ return self.__constructor__(
+ self._modin_frame.broadcast_apply_full_axis(
+ 0,
+ lambda l, r: pandas.DataFrame.compare(l, r, **kwargs),
+ other._modin_frame,
+ )
+ )
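The dict branch in `groupby_agg_builder` above filters the requested aggregations down to the columns that actually live in a given partition. A minimal standalone pandas sketch of that idea (the frame and the "c" key are hypothetical; "c" is assumed to live in a different column partition):

import pandas

partition = pandas.DataFrame({"a": [1, 1, 2], "b": [10, 20, 30]})
agg_func = {"b": "sum", "c": "mean"}  # "c" belongs to another partition

# Keep only aggregations whose target columns exist in this partition.
partition_dict = {k: v for k, v in agg_func.items() if k in partition.columns}
print(partition.groupby("a").agg(partition_dict))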
diff --git a/modin/config/envvars.py b/modin/config/envvars.py
index 167fe746af4..245619273ef 100644
--- a/modin/config/envvars.py
+++ b/modin/config/envvars.py
@@ -12,11 +12,12 @@
# governing permissions and limitations under the License.
import os
+import sys
from textwrap import dedent
import warnings
from packaging import version
-from .pubsub import Parameter, _TYPE_PARAMS
+from .pubsub import Parameter, _TYPE_PARAMS, ExactStr
class EnvironmentVariable(Parameter, type=str, abstract=True):
@@ -112,7 +113,7 @@ class IsRayCluster(EnvironmentVariable, type=bool):
varname = "MODIN_RAY_CLUSTER"
-class RayRedisAddress(EnvironmentVariable, type=str):
+class RayRedisAddress(EnvironmentVariable, type=ExactStr):
"""
What Redis address to connect to when running in Ray cluster
"""
@@ -142,7 +143,19 @@ class Memory(EnvironmentVariable, type=int):
varname = "MODIN_MEMORY"
-class RayPlasmaDir(EnvironmentVariable, type=str):
+class NPartitions(EnvironmentVariable, type=int):
+ """
+ How many partitions to use by default
+ """
+
+ varname = "MODIN_NPARTITIONS"
+
+ @classmethod
+ def _get_default(cls):
+ return CpuCount.get()
+
+
+class RayPlasmaDir(EnvironmentVariable, type=ExactStr):
"""
Path to Plasma storage for Ray
"""
@@ -158,7 +171,7 @@ class IsOutOfCore(EnvironmentVariable, type=bool):
varname = "MODIN_OUT_OF_CORE"
-class SocksProxy(EnvironmentVariable, type=str):
+class SocksProxy(EnvironmentVariable, type=ExactStr):
"""
SOCKS proxy address if it is needed for SSH to work
"""
@@ -196,6 +209,7 @@ class DoUseCalcite(EnvironmentVariable, type=bool):
"""
varname = "MODIN_USE_CALCITE"
+ default = True
class TestDatasetSize(EnvironmentVariable, type=str):
@@ -204,7 +218,30 @@ class TestDatasetSize(EnvironmentVariable, type=str):
"""
varname = "MODIN_TEST_DATASET_SIZE"
- choices = ("small", "normal", "big")
+ choices = ("Small", "Normal", "Big")
+
+
+class TrackFileLeaks(EnvironmentVariable, type=bool):
+ """
+ Whether to track open file handle leakage during testing
+ """
+
+ varname = "MODIN_TEST_TRACK_FILE_LEAKS"
+ # Turn off tracking on Windows by default because
+ # psutil's open_files() can be extremely slow on Windows (it can add several hours to a test run).
+ # see https://github.com/giampaolo/psutil/pull/597
+ default = sys.platform != "win32"
+
+
+class AsvImplementation(EnvironmentVariable, type=ExactStr):
+ """
+ Selects the library to use for performance testing.
+ """
+
+ varname = "MODIN_ASV_USE_IMPL"
+ choices = ("modin", "pandas")
+
+ default = "modin"
def _check_vars():
@@ -219,7 +256,7 @@ def _check_vars():
and issubclass(obj, EnvironmentVariable)
and not obj.is_abstract
}
- found_names = {name for name in os.environ.keys() if name.startswith("MODIN_")}
+ found_names = {name for name in os.environ if name.startswith("MODIN_")}
unknown = found_names - valid_names
if unknown:
warnings.warn(
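The new `NPartitions` variable defaults to `CpuCount.get()` when `MODIN_NPARTITIONS` is unset. A rough, standard-library-only sketch of that lookup order (the helper name is made up for illustration; the real class goes through the `EnvironmentVariable` machinery):

import multiprocessing
import os

def npartitions_sketch():
    # Prefer the environment variable, otherwise fall back to the CPU count.
    raw = os.environ.get("MODIN_NPARTITIONS")
    return int(raw.strip()) if raw is not None else multiprocessing.cpu_count()

print(npartitions_sketch())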
diff --git a/modin/config/pubsub.py b/modin/config/pubsub.py
index 77aa1cc19b1..0196a3f7f13 100644
--- a/modin/config/pubsub.py
+++ b/modin/config/pubsub.py
@@ -22,11 +22,23 @@ class TypeDescriptor(typing.NamedTuple):
help: str
+class ExactStr(str):
+ """
+ To be used in type params where no transformations are needed
+ """
+
+
_TYPE_PARAMS = {
str: TypeDescriptor(
decode=lambda value: value.strip().title(),
normalize=lambda value: value.strip().title(),
verify=lambda value: True,
+ help="a case-insensitive string",
+ ),
+ ExactStr: TypeDescriptor(
+ decode=lambda value: value,
+ normalize=lambda value: value,
+ verify=lambda value: True,
help="a string",
),
bool: TypeDescriptor(
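The two string descriptors differ only in how values are decoded: `str`-typed variables are stripped and title-cased, while `ExactStr` values pass through untouched, which is what paths and addresses such as `RayPlasmaDir` or `SocksProxy` need. A small sketch mirroring the two decode lambdas above:

def decode_str(value):        # behavior of the `str` descriptor
    return value.strip().title()

def decode_exact_str(value):  # behavior of the new `ExactStr` descriptor
    return value

raw = " /tmp/plasma_dir "
print(decode_str(raw))        # "/Tmp/Plasma_Dir" -- case and whitespace mangled
print(decode_exact_str(raw))  # " /tmp/plasma_dir " -- preserved verbatim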
diff --git a/modin/config/test/test_envvars.py b/modin/config/test/test_envvars.py
index bf4ce04a9aa..a40ae4a0497 100644
--- a/modin/config/test/test_envvars.py
+++ b/modin/config/test/test_envvars.py
@@ -14,7 +14,7 @@
import os
import pytest
-from modin.config.envvars import EnvironmentVariable, _check_vars
+from modin.config.envvars import EnvironmentVariable, _check_vars, ExactStr
@pytest.fixture
@@ -25,9 +25,9 @@ def make_unknown_env():
del os.environ[varname]
-@pytest.fixture
-def make_custom_envvar():
- class CustomVar(EnvironmentVariable, type=str):
+@pytest.fixture(params=[str, ExactStr])
+def make_custom_envvar(request):
+ class CustomVar(EnvironmentVariable, type=request.param):
""" custom var """
default = 10
@@ -40,7 +40,7 @@ class CustomVar(EnvironmentVariable, type=str):
@pytest.fixture
def set_custom_envvar(make_custom_envvar):
os.environ[make_custom_envvar.varname] = " custom "
- yield "Custom"
+ yield "Custom" if make_custom_envvar.type is str else " custom "
del os.environ[make_custom_envvar.varname]
diff --git a/modin/conftest.py b/modin/conftest.py
index 86c0ed252f0..4280a9725ec 100644
--- a/modin/conftest.py
+++ b/modin/conftest.py
@@ -14,11 +14,32 @@
import os
import sys
import pytest
+import pandas
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+import shutil
import modin
import modin.config
from modin.config import IsExperimental
+from modin.backends import PandasQueryCompiler, BaseQueryCompiler
+from modin.engines.python.pandas_on_python.io import PandasOnPythonIO
+from modin.data_management.factories import factories
+from modin.utils import get_current_backend
+from modin.pandas.test.utils import (
+ _make_csv_file,
+ get_unique_filename,
+ teardown_test_files,
+ NROWS,
+ IO_OPS_DATA_DIR,
+)
+
+# create the test data directory if it does not exist yet
+if not os.path.exists(IO_OPS_DATA_DIR):
+ os.mkdir(IO_OPS_DATA_DIR)
+
def pytest_addoption(parser):
parser.addoption(
@@ -27,6 +48,12 @@ def pytest_addoption(parser):
default="off",
help="simulate cloud for testing: off|normal|experimental",
)
+ parser.addoption(
+ "--backend",
+ action="store",
+ default=None,
+ help="specifies backend to run tests on",
+ )
class Patcher:
@@ -143,6 +170,246 @@ def __contains__(self, name):
def __getattr__(self, name):
return getattr(orig_env, name)
+ def __iter__(self):
+ return iter(orig_env)
+
os.environ = PatchedEnv()
yield
os.environ = orig_env
+
+
+BASE_BACKEND_NAME = "BaseOnPython"
+
+
+class TestQC(BaseQueryCompiler):
+ def __init__(self, modin_frame):
+ self._modin_frame = modin_frame
+
+ @classmethod
+ def from_pandas(cls, df, data_cls):
+ return cls(data_cls.from_pandas(df))
+
+ @classmethod
+ def from_arrow(cls, at, data_cls):
+ return cls(data_cls.from_arrow(at))
+
+ def free(self):
+ pass
+
+ to_pandas = PandasQueryCompiler.to_pandas
+ default_to_pandas = PandasQueryCompiler.default_to_pandas
+
+
+class BaseOnPythonIO(PandasOnPythonIO):
+ query_compiler_cls = TestQC
+
+
+class BaseOnPythonFactory(factories.BaseFactory):
+ @classmethod
+ def prepare(cls):
+ cls.io_cls = BaseOnPythonIO
+
+
+def set_base_backend(name=BASE_BACKEND_NAME):
+ setattr(factories, f"{name}Factory", BaseOnPythonFactory)
+ modin.set_backends(engine="python", partition=name.split("On")[0])
+
+
+def pytest_configure(config):
+ backend = config.option.backend
+
+ if backend is None:
+ return
+
+ if backend == BASE_BACKEND_NAME:
+ set_base_backend(BASE_BACKEND_NAME)
+ else:
+ partition, engine = backend.split("On")
+ modin.set_backends(engine=engine, partition=partition)
+
+
+def pytest_runtest_call(item):
+ custom_markers = ["xfail", "skip"]
+
+ # dynamically adding custom markers to tests
+ for custom_marker in custom_markers:
+ for marker in item.iter_markers(name=f"{custom_marker}_backends"):
+ backends = marker.args[0]
+ if not isinstance(backends, list):
+ backends = [backends]
+
+ current_backend = get_current_backend()
+ reason = marker.kwargs.pop("reason", "")
+
+ item.add_marker(
+ getattr(pytest.mark, custom_marker)(
+ condition=current_backend in backends,
+ reason=f"Backend {current_backend} does not pass this test. {reason}",
+ **marker.kwargs,
+ )
+ )
+
+
+@pytest.fixture(scope="class")
+def TestReadCSVFixture():
+ filenames = []
+ files_ids = [
+ "test_read_csv_regular",
+ "test_read_csv_blank_lines",
+ "test_read_csv_yes_no",
+ "test_read_csv_nans",
+ "test_read_csv_bad_lines",
+ ]
+ # each xdist worker is spawned in a separate process with its own namespace and dataset
+ pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids}
+ # test_read_csv_col_handling, test_read_csv_parsing
+ _make_csv_file(filenames)(
+ filename=pytest.csvs_names["test_read_csv_regular"],
+ )
+ # test_read_csv_parsing
+ _make_csv_file(filenames)(
+ filename=pytest.csvs_names["test_read_csv_yes_no"],
+ additional_col_values=["Yes", "true", "No", "false"],
+ )
+ # test_read_csv_col_handling
+ _make_csv_file(filenames)(
+ filename=pytest.csvs_names["test_read_csv_blank_lines"],
+ add_blank_lines=True,
+ )
+ # test_read_csv_nans_handling
+ _make_csv_file(filenames)(
+ filename=pytest.csvs_names["test_read_csv_nans"],
+ add_blank_lines=True,
+ additional_col_values=["", "N/A", "NA", "NULL", "custom_nan", "73"],
+ )
+ # test_read_csv_error_handling
+ _make_csv_file(filenames)(
+ filename=pytest.csvs_names["test_read_csv_bad_lines"],
+ add_bad_lines=True,
+ )
+
+ yield
+ # Delete csv files that were created
+ teardown_test_files(filenames)
+
+
+@pytest.fixture
+def make_csv_file():
+ """Pytest fixture factory that makes temp csv files for testing.
+ Yields:
+ Function that generates csv files
+ """
+ filenames = []
+
+ yield _make_csv_file(filenames)
+
+ # Delete csv files that were created
+ teardown_test_files(filenames)
+
+
+@pytest.fixture
+def make_parquet_file():
+ """Pytest fixture factory that makes a parquet file/dir for testing.
+
+ Yields:
+ Function that generates a parquet file/dir
+ """
+ filenames = []
+
+ def _make_parquet_file(
+ filename,
+ row_size=NROWS,
+ force=True,
+ directory=False,
+ partitioned_columns=[],
+ ):
+ """Helper function to generate parquet files/directories.
+
+ Args:
+ filename: The name of the test file to be created.
+ row_size: Number of rows for the dataframe.
+ force: Create a new file/directory even if one already exists.
+ directory: Create a partitioned directory using pyarrow.
+ partitioned_columns: Create a partitioned directory using pandas.
+ Will be ignored if directory=True.
+ """
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
+ if os.path.exists(filename) and not force:
+ pass
+ elif directory:
+ if os.path.exists(filename):
+ shutil.rmtree(filename)
+ else:
+ os.mkdir(filename)
+ table = pa.Table.from_pandas(df)
+ pq.write_to_dataset(table, root_path=filename)
+ elif len(partitioned_columns) > 0:
+ df.to_parquet(filename, partition_cols=partitioned_columns)
+ else:
+ df.to_parquet(filename)
+
+ filenames.append(filename)
+
+ # Return function that generates parquet files
+ yield _make_parquet_file
+
+ # Delete parquet file that was created
+ for path in filenames:
+ if os.path.exists(path):
+ if os.path.isdir(path):
+ shutil.rmtree(path)
+ else:
+ os.remove(path)
+
+
+@pytest.fixture
+def make_sql_connection():
+ """Sets up sql connections and takes them down after the caller is done.
+
+ Yields:
+ Factory that generates sql connection objects
+ """
+ filenames = []
+
+ def _sql_connection(filename, table=""):
+ # Remove file if exists
+ if os.path.exists(filename):
+ os.remove(filename)
+ filenames.append(filename)
+ # Create connection and, if needed, table
+ conn = "sqlite:///{}".format(filename)
+ if table:
+ df = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3, 4, 5, 6],
+ "col2": [7, 8, 9, 10, 11, 12, 13],
+ "col3": [14, 15, 16, 17, 18, 19, 20],
+ "col4": [21, 22, 23, 24, 25, 26, 27],
+ "col5": [0, 0, 0, 0, 0, 0, 0],
+ }
+ )
+ df.to_sql(table, conn)
+ return conn
+
+ yield _sql_connection
+
+ # Teardown the fixture
+ teardown_test_files(filenames)
+
+
+@pytest.fixture(scope="class")
+def TestReadGlobCSVFixture():
+ filenames = []
+
+ base_name = get_unique_filename(extension="")
+ pytest.glob_path = "{}_*.csv".format(base_name)
+ pytest.files = ["{}_{}.csv".format(base_name, i) for i in range(11)]
+ for fname in pytest.files:
+ # Glob does not guarantee ordering so we have to remove the randomness in the generated csvs.
+ _make_csv_file(filenames)(fname, row_size=11, remove_randomness=True)
+
+ yield
+
+ teardown_test_files(filenames)
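Given the `xfail_backends`/`skip_backends` markers that `pytest_runtest_call` resolves above, a test module could use them roughly as follows (the test names and reason strings are invented for illustration):

import pytest

@pytest.mark.xfail_backends(
    ["BaseOnPython"], reason="operation defaults to pandas on this backend"
)
def test_groupby_dict_agg():
    ...

@pytest.mark.skip_backends("PandasOnPython")
def test_engine_specific_behavior():
    ...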
diff --git a/modin/data_management/factories/dispatcher.py b/modin/data_management/factories/dispatcher.py
index c54c4a49148..d1a3e9269d2 100644
--- a/modin/data_management/factories/dispatcher.py
+++ b/modin/data_management/factories/dispatcher.py
@@ -103,6 +103,10 @@ def read_parquet(cls, **kwargs):
def read_csv(cls, **kwargs):
return cls.__engine._read_csv(**kwargs)
+ @classmethod
+ def read_csv_glob(cls, **kwargs):
+ return cls.__engine._read_csv_glob(**kwargs)
+
@classmethod
def read_json(cls, **kwargs):
return cls.__engine._read_json(**kwargs)
diff --git a/modin/data_management/factories/factories.py b/modin/data_management/factories/factories.py
index d0420bbc559..63e5adf127e 100644
--- a/modin/data_management/factories/factories.py
+++ b/modin/data_management/factories/factories.py
@@ -231,6 +231,10 @@ def prepare(cls):
cls.io_cls = ExperimentalPandasOnRayIO
+ @classmethod
+ def _read_csv_glob(cls, **kwargs):
+ return cls.io_cls.read_csv_glob(**kwargs)
+
class ExperimentalPandasOnPythonFactory(ExperimentalBaseFactory, PandasOnPythonFactory):
pass
diff --git a/modin/data_management/functions/__init__.py b/modin/data_management/functions/__init__.py
index ec78d509317..e5e58ea3bdf 100644
--- a/modin/data_management/functions/__init__.py
+++ b/modin/data_management/functions/__init__.py
@@ -11,18 +11,21 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
+from .function import Function
from .mapfunction import MapFunction
from .mapreducefunction import MapReduceFunction
from .reductionfunction import ReductionFunction
from .foldfunction import FoldFunction
from .binary_function import BinaryFunction
-from .groupby_function import GroupbyReduceFunction
+from .groupby_function import GroupbyReduceFunction, groupby_reduce_functions
__all__ = [
+ "Function",
"MapFunction",
"MapReduceFunction",
"ReductionFunction",
"FoldFunction",
"BinaryFunction",
"GroupbyReduceFunction",
+ "groupby_reduce_functions",
]
diff --git a/modin/data_management/functions/binary_function.py b/modin/data_management/functions/binary_function.py
index b8edf8f608a..59f278e0070 100644
--- a/modin/data_management/functions/binary_function.py
+++ b/modin/data_management/functions/binary_function.py
@@ -23,6 +23,7 @@ def call(cls, func, *call_args, **call_kwds):
def caller(query_compiler, other, *args, **kwargs):
axis = kwargs.get("axis", 0)
broadcast = kwargs.pop("broadcast", False)
+ join_type = call_kwds.get("join_type", "outer")
if isinstance(other, type(query_compiler)):
if broadcast:
assert (
@@ -39,11 +40,11 @@ def caller(query_compiler, other, *args, **kwargs):
axis,
lambda l, r: func(l, r.squeeze(), *args, **kwargs),
other._modin_frame,
+ join_type=join_type,
preserve_labels=call_kwds.get("preserve_labels", False),
)
)
else:
- join_type = call_kwds.get("join_type", "outer")
return query_compiler.__constructor__(
query_compiler._modin_frame._binary_op(
lambda x, y: func(x, y, *args, **kwargs),
@@ -53,12 +54,11 @@ def caller(query_compiler, other, *args, **kwargs):
)
else:
if isinstance(other, (list, np.ndarray, pandas.Series)):
- new_columns = query_compiler.columns
new_modin_frame = query_compiler._modin_frame._apply_full_axis(
axis,
lambda df: func(df, other, *args, **kwargs),
new_index=query_compiler.index,
- new_columns=new_columns,
+ new_columns=query_compiler.columns,
)
else:
new_modin_frame = query_compiler._modin_frame._map(
diff --git a/modin/data_management/functions/default_methods/binary_default.py b/modin/data_management/functions/default_methods/binary_default.py
index 201e43af045..0fe4134d398 100644
--- a/modin/data_management/functions/default_methods/binary_default.py
+++ b/modin/data_management/functions/default_methods/binary_default.py
@@ -19,7 +19,7 @@
class BinaryDefault(AnyDefault):
@classmethod
- def build_default_to_pandas(cls, fn):
+ def build_default_to_pandas(cls, fn, fn_name):
def bin_ops_wrapper(df, other, *args, **kwargs):
squeeze_other = kwargs.pop("broadcast", False) or kwargs.pop(
"squeeze_other", False
@@ -41,7 +41,4 @@ def bin_ops_wrapper(df, other, *args, **kwargs):
result = pandas.DataFrame(result)
return result
- def wrapper(self, *args, **kwargs):
- return self.default_to_pandas(bin_ops_wrapper, *args, **kwargs)
-
- return wrapper
+ return super().build_default_to_pandas(bin_ops_wrapper, fn_name)
diff --git a/modin/data_management/functions/default_methods/default.py b/modin/data_management/functions/default_methods/default.py
index cf1bb739261..96d1d0ac69d 100644
--- a/modin/data_management/functions/default_methods/default.py
+++ b/modin/data_management/functions/default_methods/default.py
@@ -19,10 +19,13 @@
class DefaultMethod(Function):
+ OBJECT_TYPE = "DataFrame"
+
@classmethod
def call(cls, func, **call_kwds):
obj = call_kwds.get("obj_type", pandas.DataFrame)
force_inplace = call_kwds.get("inplace")
+ fn_name = call_kwds.get("fn_name", getattr(func, "__name__", str(func)))
if isinstance(func, str):
fn = getattr(obj, func)
@@ -57,28 +60,21 @@ def applyier(df, *args, **kwargs):
inplace = force_inplace
return result if not inplace else df
- return cls.build_wrapper(applyier, func)
+ return cls.build_wrapper(applyier, fn_name)
@classmethod
def register(cls, func, **kwargs):
return cls.call(func, **kwargs)
@classmethod
- def build_wrapper(cls, fn, fn_name=None):
- wrapper = cls.build_default_to_pandas(fn)
+ def build_wrapper(cls, fn, fn_name):
+ wrapper = cls.build_default_to_pandas(fn, fn_name)
def args_cast(self, *args, **kwargs):
args = try_cast_to_pandas(args)
kwargs = try_cast_to_pandas(kwargs)
return wrapper(self, *args, **kwargs)
- if fn_name is None:
- fn_name = fn.__name__
- if not isinstance(fn_name, str):
- fn_name = getattr(fn_name, "__name__", repr(fn_name))
-
- # setting proper function name that will be printed in default to pandas warning
- args_cast.__name__ = fn_name
return args_cast
@classmethod
@@ -89,7 +85,9 @@ def property_wrapper(df):
return property_wrapper
@classmethod
- def build_default_to_pandas(cls, fn):
+ def build_default_to_pandas(cls, fn, fn_name):
+ fn.__name__ = f"<function {cls.OBJECT_TYPE}.{fn_name}>"
+
def wrapper(self, *args, **kwargs):
return self.default_to_pandas(fn, *args, **kwargs)
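The `fn_name`/`OBJECT_TYPE` plumbing exists so the pandas-fallback wrapper carries a readable `__name__` such as `<function GroupBy.sum>`, which the default-to-pandas warning can then report. A stripped-down sketch of that naming trick (the builder below is illustrative, not the Modin implementation):

import warnings
import pandas

def build_fallback_sketch(fn, fn_name, object_type="DataFrame"):
    # Give the fallback a descriptive name so warnings can report it.
    fn.__name__ = f"<function {object_type}.{fn_name}>"

    def wrapper(*args, **kwargs):
        warnings.warn(f"{fn.__name__} is defaulting to pandas.")
        return fn(*args, **kwargs)

    return wrapper

fallback_sum = build_fallback_sketch(lambda df: df.sum(), "sum")
print(fallback_sum(pandas.DataFrame({"a": [1, 2, 3]})))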
diff --git a/modin/data_management/functions/default_methods/groupby_default.py b/modin/data_management/functions/default_methods/groupby_default.py
index 6be7089fd25..4a01bf92133 100644
--- a/modin/data_management/functions/default_methods/groupby_default.py
+++ b/modin/data_management/functions/default_methods/groupby_default.py
@@ -27,6 +27,8 @@ class GroupBy:
@classmethod
def validate_by(cls, by):
def try_cast_series(df):
+ if isinstance(df, pandas.DataFrame):
+ df = df.squeeze(axis=1)
if not isinstance(df, pandas.Series):
return df
if df.name == "__reduced__":
@@ -61,22 +63,27 @@ def get_func(cls, grp, key, **kwargs):
@classmethod
def build_aggregate_method(cls, key):
- def fn(df, by, groupby_args, agg_args, axis=0, drop=False, **kwargs):
+ def fn(
+ df,
+ by,
+ groupby_args,
+ agg_args,
+ axis=0,
+ is_multi_by=None,
+ drop=False,
+ **kwargs
+ ):
by = cls.validate_by(by)
- groupby_args = groupby_args.copy()
- as_index = groupby_args.pop("as_index", True)
- groupby_args["as_index"] = True
grp = df.groupby(by, axis=axis, **groupby_args)
agg_func = cls.get_func(grp, key, **kwargs)
- result = agg_func(grp, **agg_args)
+ result = (
+ grp.agg(agg_func, **agg_args)
+ if isinstance(agg_func, dict)
+ else agg_func(grp, **agg_args)
+ )
- if as_index:
- return result
- else:
- if result.index.name is None or result.index.name in result.columns:
- drop = False
- return result.reset_index(drop=not drop)
+ return result
return fn
@@ -93,6 +100,7 @@ def fn(
**kwargs
):
if not isinstance(by, (pandas.Series, pandas.DataFrame)):
+ by = cls.validate_by(by)
return agg_func(
df.groupby(by=by, axis=axis, **groupby_args), **map_args
)
@@ -119,11 +127,16 @@ def fn(
grp = df.groupby(by, axis=axis, **groupby_args)
result = agg_func(grp, **map_args)
+ if isinstance(result, pandas.Series):
+ result = result.to_frame()
+
if not as_index:
if (
len(result.index.names) == 1 and result.index.names[0] is None
) or all([name in result.columns for name in result.index.names]):
drop = False
+ elif kwargs.get("method") == "size":
+ drop = True
result = result.reset_index(drop=not drop)
if result.index.name == "__reduced__":
@@ -145,6 +158,8 @@ def build_groupby(cls, func):
class GroupByDefault(DefaultMethod):
+ OBJECT_TYPE = "GroupBy"
+
@classmethod
def register(cls, func, **kwargs):
- return cls.call(GroupBy.build_groupby(func), **kwargs)
+ return cls.call(GroupBy.build_groupby(func), fn_name=func.__name__, **kwargs)
diff --git a/modin/data_management/functions/default_methods/resample_default.py b/modin/data_management/functions/default_methods/resample_default.py
index dc63aecaef9..f097b9ed277 100644
--- a/modin/data_management/functions/default_methods/resample_default.py
+++ b/modin/data_management/functions/default_methods/resample_default.py
@@ -31,6 +31,12 @@ def fn(df, resample_args, *args, **kwargs):
class ResampleDefault(DefaultMethod):
+ OBJECT_TYPE = "Resampler"
+
@classmethod
def register(cls, func, squeeze_self=False, **kwargs):
- return cls.call(Resampler.build_resample(func, squeeze_self), **kwargs)
+ return cls.call(
+ Resampler.build_resample(func, squeeze_self),
+ fn_name=func.__name__,
+ **kwargs
+ )
diff --git a/modin/data_management/functions/default_methods/rolling_default.py b/modin/data_management/functions/default_methods/rolling_default.py
index d3ee14ed0b4..1cc9377e982 100644
--- a/modin/data_management/functions/default_methods/rolling_default.py
+++ b/modin/data_management/functions/default_methods/rolling_default.py
@@ -29,6 +29,8 @@ def fn(df, rolling_args, *args, **kwargs):
class RollingDefault(DefaultMethod):
+ OBJECT_TYPE = "Rolling"
+
@classmethod
def register(cls, func, **kwargs):
- return cls.call(Rolling.build_rolling(func), **kwargs)
+ return cls.call(Rolling.build_rolling(func), fn_name=func.__name__, **kwargs)
diff --git a/modin/data_management/functions/default_methods/series_default.py b/modin/data_management/functions/default_methods/series_default.py
index 8d0cc952dfe..4d3715821c9 100644
--- a/modin/data_management/functions/default_methods/series_default.py
+++ b/modin/data_management/functions/default_methods/series_default.py
@@ -15,6 +15,8 @@
class SeriesDefault(AnyDefault):
+ OBJECT_TYPE = "Series"
+
@classmethod
def frame_wrapper(cls, df):
return df.squeeze(axis=1)
diff --git a/modin/data_management/functions/foldfunction.py b/modin/data_management/functions/foldfunction.py
index e72e039da04..3deea5591f7 100644
--- a/modin/data_management/functions/foldfunction.py
+++ b/modin/data_management/functions/foldfunction.py
@@ -18,11 +18,10 @@ class FoldFunction(Function):
@classmethod
def call(cls, fold_function, **call_kwds):
def caller(query_compiler, *args, **kwargs):
+ axis = call_kwds.get("axis", kwargs.get("axis"))
return query_compiler.__constructor__(
query_compiler._modin_frame._fold(
- call_kwds.get("axis")
- if "axis" in call_kwds
- else kwargs.get("axis"),
+ cls.validate_axis(axis),
lambda x: fold_function(x, *args, **kwargs),
)
)
diff --git a/modin/data_management/functions/function.py b/modin/data_management/functions/function.py
index 517515f9059..072d725d6ce 100644
--- a/modin/data_management/functions/function.py
+++ b/modin/data_management/functions/function.py
@@ -11,6 +11,8 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
+from typing import Optional
+
class Function(object):
def __init__(self):
@@ -27,3 +29,7 @@ def call(cls, func, **call_kwds):
@classmethod
def register(cls, func, **kwargs):
return cls.call(func, **kwargs)
+
+ @classmethod
+ def validate_axis(cls, axis: Optional[int]) -> int:
+ return 0 if axis is None else axis
diff --git a/modin/data_management/functions/groupby_function.py b/modin/data_management/functions/groupby_function.py
index 6f4890a0e7a..0af75640168 100644
--- a/modin/data_management/functions/groupby_function.py
+++ b/modin/data_management/functions/groupby_function.py
@@ -14,180 +14,263 @@
import pandas
from .mapreducefunction import MapReduceFunction
-from modin.utils import try_cast_to_pandas
+from modin.utils import try_cast_to_pandas, hashable
class GroupbyReduceFunction(MapReduceFunction):
@classmethod
- def call(cls, map_func, reduce_func, *call_args, **call_kwds):
- def caller(
- query_compiler,
- by,
- axis,
- groupby_args,
- map_args,
- reduce_args=None,
- numeric_only=True,
- drop=False,
- ):
- if not isinstance(by, (type(query_compiler), str)):
- by = try_cast_to_pandas(by)
- return query_compiler.default_to_pandas(
- lambda df: map_func(
- df.groupby(by=by, axis=axis, **groupby_args), **map_args
- )
+ def call(cls, map_func, reduce_func=None, **call_kwds):
+ """
+ Build GroupbyReduce function.
+
+ Parameters
+ ----------
+ map_func: str, callable or dict,
+ If 'str', this parameter is treated as a function name to register,
+ so both 'map_func' and 'reduce_func' are taken from 'groupby_reduce_functions'.
+ If dict or callable, it is treated as the function to apply to each group
+ at the map phase.
+ reduce_func: callable or dict (optional),
+ A function to apply to each group at the reduce phase. If not specified,
+ it is set to the same value as 'map_func'.
+ **call_kwds: kwargs,
+ Kwargs that will be passed to the returned function.
+
+ Returns
+ -------
+ Callable,
+ Function that executes a GroupBy aggregation with the MapReduce algorithm.
+ """
+ if isinstance(map_func, str):
+
+ def build_fn(name):
+ return lambda df, *args, **kwargs: getattr(df, name)(*args, **kwargs)
+
+ map_func, reduce_func = map(build_fn, groupby_reduce_functions[map_func])
+ if reduce_func is None:
+ reduce_func = map_func
+ assert not (
+ isinstance(map_func, dict) ^ isinstance(reduce_func, dict)
+ ) and not (
+ callable(map_func) ^ callable(reduce_func)
+ ), "Map and reduce functions must be either both dict or both callable."
+
+ return lambda *args, **kwargs: cls.caller(
+ *args, map_func=map_func, reduce_func=reduce_func, **kwargs, **call_kwds
+ )
+
+ @classmethod
+ def map(
+ cls,
+ df,
+ other=None,
+ axis=0,
+ by=None,
+ groupby_args=None,
+ map_func=None,
+ map_args=None,
+ drop=False,
+ ):
+ # Set `as_index` to True to track the metadata of the grouping object
+ # It is used to make sure that between phases we are constructing the
+ # right index and placing columns in the correct order.
+ groupby_args["as_index"] = True
+ groupby_args["observed"] = True
+ if other is not None:
+ # Other is a broadcasted partition that represents 'by' columns
+ # Concatenate it with 'df' to group on its column names
+ other = other.squeeze(axis=axis ^ 1)
+ if isinstance(other, pandas.DataFrame):
+ df = pandas.concat(
+ [df] + [other[[o for o in other if o not in df]]],
+ axis=1,
)
- assert axis == 0, "Can only groupby reduce with axis=0"
+ other = list(other.columns)
+ by_part = other
+ else:
+ by_part = by
+
+ apply_func = cls.try_filter_dict(map_func, df)
+ result = apply_func(
+ df.groupby(by=by_part, axis=axis, **groupby_args), **map_args
+ )
+ # The result may not always be a frame, so wrap it in a DataFrame
+ return pandas.DataFrame(result)
+
+ @classmethod
+ def reduce(
+ cls,
+ df,
+ partition_idx=0,
+ axis=0,
+ groupby_args=None,
+ reduce_func=None,
+ reduce_args=None,
+ drop=False,
+ **kwargs,
+ ):
+ # Wrapping names into an Index should be unnecessary, however
+ # there is a bug in pandas with intersection that forces us to do so:
+ # https://github.com/pandas-dev/pandas/issues/39699
+ by_part = pandas.Index(df.index.names)
+ if drop and len(df.columns.intersection(by_part)) > 0:
+ df.drop(columns=by_part, errors="ignore", inplace=True)
+
+ groupby_args = groupby_args.copy()
+ method = kwargs.get("method", None)
+ as_index = groupby_args["as_index"]
+
+ # Set `as_index` to True to track the metadata of the grouping object
+ groupby_args["as_index"] = True
+
+ # since the index levels now contain our 'by' columns, in the reduce phase
+ # we want to group on these levels
+ groupby_args["level"] = list(range(len(df.index.names)))
+
+ apply_func = cls.try_filter_dict(reduce_func, df)
+ result = apply_func(df.groupby(axis=axis, **groupby_args), **reduce_args)
+
+ if not as_index:
+ insert_levels = partition_idx == 0 and (drop or method == "size")
+ result.reset_index(drop=not insert_levels, inplace=True)
+ # The result may not always be a frame, so wrap it in a DataFrame
+ return pandas.DataFrame(result)
- if numeric_only:
- qc = query_compiler.getitem_column_array(
- query_compiler._modin_frame._numeric_columns(True)
+ @classmethod
+ def caller(
+ cls,
+ query_compiler,
+ by,
+ axis,
+ groupby_args,
+ map_args,
+ map_func,
+ numeric_only=True,
+ **kwargs,
+ ):
+ if not (isinstance(by, (type(query_compiler)) or hashable(by))) or isinstance(
+ by, pandas.Grouper
+ ):
+ by = try_cast_to_pandas(by, squeeze=True)
+ default_func = (
+ (lambda grp: grp.agg(map_func))
+ if isinstance(map_func, dict)
+ else map_func
+ )
+ return query_compiler.default_to_pandas(
+ lambda df: default_func(
+ df.groupby(by=by, axis=axis, **groupby_args), **map_args
)
- else:
- qc = query_compiler
- # since we're going to modify `groupby_args` dict in a `compute_map`,
- # we want to copy it to not propagate these changes into source dict, in case
- # of unsuccessful end of function
- groupby_args = groupby_args.copy()
-
- as_index = groupby_args.get("as_index", True)
- observed = groupby_args.get("observed", False)
-
- if isinstance(by, str):
-
- def _map(df):
- # Set `as_index` to True to track the metadata of the grouping
- # object It is used to make sure that between phases we are
- # constructing the right index and placing columns in the correct
- # order.
- groupby_args["as_index"] = True
- groupby_args["observed"] = True
-
- result = map_func(
- df.groupby(by=by, axis=axis, **groupby_args), **map_args
- )
- # The _modin_groupby_ prefix indicates that this is the first
- # partition, and since we may need to insert the grouping data in
- # the reduce phase
- if (
- not isinstance(result.index, pandas.MultiIndex)
- and result.index.name is not None
- and result.index.name in result.columns
- ):
- result.index.name = "{}{}".format(
- "_modin_groupby_", result.index.name
- )
- return result
-
- else:
-
- def _map(df, other):
- def compute_map(df, other):
- # Set `as_index` to True to track the metadata of the grouping object
- # It is used to make sure that between phases we are constructing the
- # right index and placing columns in the correct order.
- groupby_args["as_index"] = True
- groupby_args["observed"] = True
-
- other = other.squeeze(axis=axis ^ 1)
- if isinstance(other, pandas.DataFrame):
- df = pandas.concat(
- [df] + [other[[o for o in other if o not in df]]],
- axis=1,
- )
- other = list(other.columns)
- result = map_func(
- df.groupby(by=other, axis=axis, **groupby_args), **map_args
- )
- # if `other` has category dtype, then pandas will drop that
- # column after groupby, inserting it back to correctly process
- # reduce phase
- if (
- drop
- and not as_index
- and isinstance(other, pandas.Series)
- and isinstance(other.dtype, pandas.CategoricalDtype)
- and result.index.name is not None
- and result.index.name not in result.columns
- ):
- result.insert(
- loc=0, column=result.index.name, value=result.index
- )
- # The _modin_groupby_ prefix indicates that this is the first partition,
- # and since we may need to insert the grouping data in the reduce phase
- if (
- not isinstance(result.index, pandas.MultiIndex)
- and result.index.name is not None
- and result.index.name in result.columns
- ):
- result.index.name = "{}{}".format(
- "_modin_groupby_", result.index.name
- )
- return result
-
- try:
- return compute_map(df, other)
- # This will happen with Arrow buffer read-only errors. We don't want to copy
- # all the time, so this will try to fast-path the code first.
- except ValueError:
- return compute_map(df.copy(), other.copy())
-
- def _reduce(df):
- def compute_reduce(df):
- other_len = len(df.index.names)
- df = df.reset_index(drop=False)
- # See note above about setting `as_index`
- groupby_args["as_index"] = as_index
- groupby_args["observed"] = observed
- if other_len > 1:
- by_part = list(df.columns[0:other_len])
- else:
- by_part = df.columns[0]
- result = reduce_func(
- df.groupby(by=by_part, axis=axis, **groupby_args), **reduce_args
- )
- if (
- not isinstance(result.index, pandas.MultiIndex)
- and result.index.name is not None
- and "_modin_groupby_" in result.index.name
- ):
- result.index.name = result.index.name[len("_modin_groupby_") :]
- if isinstance(by_part, str) and by_part in result.columns:
- if "_modin_groupby_" in by_part and drop:
- col_name = by_part[len("_modin_groupby_") :]
- new_result = result.drop(columns=col_name, errors="ignore")
- new_result.columns = [
- col_name if "_modin_groupby_" in c else c
- for c in new_result.columns
- ]
- return new_result
- else:
- return (
- result.drop(columns=by_part)
- if call_kwds.get("method", None) != "size"
- else result
- )
- return result
-
- try:
- return compute_reduce(df)
- # This will happen with Arrow buffer read-only errors. We don't want to copy
- # all the time, so this will try to fast-path the code first.
- except ValueError:
- return compute_reduce(df.copy())
-
- # TODO: try to precompute `new_index` and `new_columns`
- if isinstance(by, str):
- new_modin_frame = qc._modin_frame._map_reduce(
- axis, _map, reduce_func=_reduce, preserve_index=False
+ )
+ assert axis == 0, "Can only groupby reduce with axis=0"
+
+ if numeric_only:
+ qc = query_compiler.getitem_column_array(
+ query_compiler._modin_frame._numeric_columns(True)
+ )
+ else:
+ qc = query_compiler
+
+ map_fn, reduce_fn = cls.build_map_reduce_functions(
+ by=by,
+ axis=axis,
+ groupby_args=groupby_args,
+ map_func=map_func,
+ map_args=map_args,
+ **kwargs,
+ )
+
+ broadcastable_by = getattr(by, "_modin_frame", None)
+ apply_indices = list(map_func.keys()) if isinstance(map_func, dict) else None
+ new_modin_frame = qc._modin_frame.groupby_reduce(
+ axis, broadcastable_by, map_fn, reduce_fn, apply_indices=apply_indices
+ )
+
+ result = query_compiler.__constructor__(new_modin_frame)
+ if result.index.name == "__reduced__":
+ result.index.name = None
+ return result
+
+ @staticmethod
+ def try_filter_dict(agg_func, df):
+ if not isinstance(agg_func, dict):
+ return agg_func
+ partition_dict = {k: v for k, v in agg_func.items() if k in df.columns}
+ return lambda grp: grp.agg(partition_dict)
+
+ @classmethod
+ def build_map_reduce_functions(
+ cls,
+ by,
+ axis,
+ groupby_args,
+ map_func,
+ map_args,
+ reduce_func,
+ reduce_args,
+ drop,
+ **kwargs,
+ ):
+ # if 'by' is a query compiler, it will be broadcast explicitly via the
+ # groupby_reduce method of the modin frame, so we don't want a secondary
+ # implicit broadcast from passing it as a function argument.
+ if hasattr(by, "_modin_frame"):
+ by = None
+
+ def _map(df, other=None, **kwargs):
+ def wrapper(df, other=None):
+ return cls.map(
+ df,
+ other,
+ axis=axis,
+ by=by,
+ groupby_args=groupby_args.copy(),
+ map_func=map_func,
+ map_args=map_args,
+ drop=drop,
+ **kwargs,
)
- else:
- new_modin_frame = qc._modin_frame.groupby_reduce(
- axis, by._modin_frame, _map, _reduce
+
+ try:
+ result = wrapper(df, other)
+ # This will happen with Arrow buffer read-only errors. We don't want to copy
+ # all the time, so this will try to fast-path the code first.
+ except ValueError:
+ result = wrapper(df.copy(), other if other is None else other.copy())
+ return result
+
+ def _reduce(df, **call_kwargs):
+ def wrapper(df):
+ return cls.reduce(
+ df,
+ axis=axis,
+ groupby_args=groupby_args,
+ reduce_func=reduce_func,
+ reduce_args=reduce_args,
+ drop=drop,
+ **kwargs,
+ **call_kwargs,
)
- result = query_compiler.__constructor__(new_modin_frame)
- if result.index.name == "__reduced__":
- result.index.name = None
+
+ try:
+ result = wrapper(df)
+ # This will happen with Arrow buffer read-only errors. We don't want to copy
+ # all the time, so this will try to fast-path the code first.
+ except ValueError:
+ result = wrapper(df.copy())
return result
- return caller
+ return _map, _reduce
+
+
+# This dict maps groupby function names to their (map, reduce) phase equivalents
+groupby_reduce_functions = {
+ "all": ("all", "all"),
+ "any": ("any", "any"),
+ "count": ("count", "sum"),
+ "max": ("max", "max"),
+ "min": ("min", "min"),
+ "prod": ("prod", "prod"),
+ "size": ("size", "sum"),
+ "sum": ("sum", "sum"),
+}
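The `groupby_reduce_functions` table above pairs every supported groupby method with its reduce-phase counterpart; "count", for instance, is a per-partition count followed by a sum of the partial counts. A self-contained pandas sketch of that decomposition on two toy row partitions:

import pandas

part1 = pandas.DataFrame({"key": ["x", "y", "x"], "val": [1, 2, 3]})
part2 = pandas.DataFrame({"key": ["y", "x", "y"], "val": [4, 5, 6]})

# Map phase: apply "count" independently to each row partition.
mapped = [p.groupby("key").count() for p in (part1, part2)]

# Reduce phase: concatenate the partial results and apply "sum" over the
# shared index, as GroupbyReduceFunction.reduce does with the index levels.
reduced = pandas.concat(mapped).groupby(level=0).sum()

# Matches a single-node groupby count over the full data.
expected = pandas.concat([part1, part2]).groupby("key").count()
assert reduced.equals(expected)
print(reduced)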
diff --git a/modin/data_management/functions/mapreducefunction.py b/modin/data_management/functions/mapreducefunction.py
index aace46679ea..e76426d4f7d 100644
--- a/modin/data_management/functions/mapreducefunction.py
+++ b/modin/data_management/functions/mapreducefunction.py
@@ -19,11 +19,10 @@ class MapReduceFunction(Function):
def call(cls, map_function, reduce_function, **call_kwds):
def caller(query_compiler, *args, **kwargs):
preserve_index = call_kwds.pop("preserve_index", True)
+ axis = call_kwds.get("axis", kwargs.get("axis"))
return query_compiler.__constructor__(
query_compiler._modin_frame._map_reduce(
- call_kwds.get("axis")
- if "axis" in call_kwds
- else kwargs.get("axis"),
+ cls.validate_axis(axis),
lambda x: map_function(x, *args, **kwargs),
lambda y: reduce_function(y, *args, **kwargs),
preserve_index=preserve_index,
@@ -33,5 +32,7 @@ def caller(query_compiler, *args, **kwargs):
return caller
@classmethod
- def register(cls, map_function, reduce_function, **kwargs):
+ def register(cls, map_function, reduce_function=None, **kwargs):
+ if reduce_function is None:
+ reduce_function = map_function
return cls.call(map_function, reduce_function, **kwargs)
diff --git a/modin/data_management/functions/reductionfunction.py b/modin/data_management/functions/reductionfunction.py
index ae6e918a659..0b44f88c911 100644
--- a/modin/data_management/functions/reductionfunction.py
+++ b/modin/data_management/functions/reductionfunction.py
@@ -18,12 +18,13 @@ class ReductionFunction(Function):
@classmethod
def call(cls, reduction_function, **call_kwds):
def caller(query_compiler, *args, **kwargs):
+ preserve_index = call_kwds.pop("preserve_index", True)
+ axis = call_kwds.get("axis", kwargs.get("axis"))
return query_compiler.__constructor__(
query_compiler._modin_frame._fold_reduce(
- call_kwds.get("axis")
- if "axis" in call_kwds
- else kwargs.get("axis"),
+ cls.validate_axis(axis),
lambda x: reduction_function(x, *args, **kwargs),
+ preserve_index=preserve_index,
)
)
diff --git a/modin/data_management/utils.py b/modin/data_management/utils.py
index 0d0a4aafa18..8a4058beb3d 100644
--- a/modin/data_management/utils.py
+++ b/modin/data_management/utils.py
@@ -82,18 +82,19 @@ def split_result_of_axis_func_pandas(axis, num_splits, result, length_list=None)
list
A list of Pandas DataFrames.
"""
- if num_splits == 1:
- return result
if length_list is not None:
length_list.insert(0, 0)
sums = np.cumsum(length_list)
- if axis == 0:
+ if axis == 0 or isinstance(result, pandas.Series):
return [result.iloc[sums[i] : sums[i + 1]] for i in range(len(sums) - 1)]
else:
return [result.iloc[:, sums[i] : sums[i + 1]] for i in range(len(sums) - 1)]
+
+ if num_splits == 1:
+ return [result]
# We do this to restore block partitioning
chunksize = compute_chunksize(result, num_splits, axis=axis)
- if axis == 0:
+ if axis == 0 or isinstance(result, pandas.Series):
return [
result.iloc[chunksize * i : chunksize * (i + 1)] for i in range(num_splits)
]
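The reordered `split_result_of_axis_func_pandas` still cuts the result back into blocks by cumulative lengths; a toy sketch of that `iloc` slicing with made-up data:

import numpy as np
import pandas

result = pandas.DataFrame({"a": range(7)})
length_list = [3, 2, 2]  # desired row-partition lengths

sums = np.cumsum([0] + length_list)
splits = [result.iloc[sums[i] : sums[i + 1]] for i in range(len(sums) - 1)]
print([len(s) for s in splits])  # [3, 2, 2]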
diff --git a/modin/distributed/dataframe/pandas/__init__.py b/modin/distributed/dataframe/pandas/__init__.py
new file mode 100644
index 00000000000..b207f531d81
--- /dev/null
+++ b/modin/distributed/dataframe/pandas/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+from .partitions import unwrap_partitions, from_partitions
+
+__all__ = ["unwrap_partitions", "from_partitions"]
diff --git a/modin/distributed/dataframe/pandas/partitions.py b/modin/distributed/dataframe/pandas/partitions.py
new file mode 100644
index 00000000000..e71522ed22f
--- /dev/null
+++ b/modin/distributed/dataframe/pandas/partitions.py
@@ -0,0 +1,159 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import numpy as np
+
+from modin.pandas.dataframe import DataFrame
+from modin.backends.pandas.query_compiler import PandasQueryCompiler
+
+
+def unwrap_partitions(api_layer_object, axis=None, bind_ip=False):
+ """
+ Unwrap partitions of the `api_layer_object`.
+
+ Parameters
+ ----------
+ api_layer_object : DataFrame or Series
+ The API layer object.
+ axis : None, 0 or 1. Default is None
+ The axis to unwrap partitions for (0 - row partitions, 1 - column partitions).
+ If axis is None, all the partitions of the API layer object are unwrapped.
+ bind_ip : boolean. Default is False
+ Whether to bind node ip address to each partition or not.
+
+ Returns
+ -------
+ list
+ A list of Ray.ObjectRef/Dask.Future to partitions of the `api_layer_object`
+ if Ray/Dask is used as an engine.
+
+ Notes
+ -----
+ In case bind_ip=True, a list containing tuples of Ray.ObjectRef/Dask.Future to node ip addresses
+ and partitions of the `api_layer_object`, respectively, is returned if Ray/Dask is used as an engine.
+ """
+ if not hasattr(api_layer_object, "_query_compiler"):
+ raise ValueError(
+ f"Only API Layer objects may be passed in here, got {type(api_layer_object)} instead."
+ )
+
+ if axis is None:
+
+ def _unwrap_partitions(oid):
+ if bind_ip:
+ return [
+ [(partition.ip, getattr(partition, oid)) for partition in row]
+ for row in api_layer_object._query_compiler._modin_frame._partitions
+ ]
+ else:
+ return [
+ [getattr(partition, oid) for partition in row]
+ for row in api_layer_object._query_compiler._modin_frame._partitions
+ ]
+
+ actual_engine = type(
+ api_layer_object._query_compiler._modin_frame._partitions[0][0]
+ ).__name__
+ if actual_engine in ("PandasOnRayFramePartition",):
+ return _unwrap_partitions("oid")
+ elif actual_engine in ("PandasOnDaskFramePartition",):
+ return _unwrap_partitions("future")
+ raise ValueError(
+ f"Do not know how to unwrap '{actual_engine}' underlying partitions"
+ )
+ else:
+ partitions = (
+ api_layer_object._query_compiler._modin_frame._frame_mgr_cls.axis_partition(
+ api_layer_object._query_compiler._modin_frame._partitions, axis ^ 1
+ )
+ )
+ return [
+ part.coalesce(bind_ip=bind_ip).unwrap(squeeze=True, bind_ip=bind_ip)
+ for part in partitions
+ ]
+
+
+def from_partitions(partitions, axis):
+ """
+ Create DataFrame from remote partitions.
+
+ Parameters
+ ----------
+ partitions : list
+ List of Ray.ObjectRef/Dask.Future objects referencing partitions, depending on the engine used.
+ Alternatively, a list of tuples of Ray.ObjectRef/Dask.Future objects referencing the node IP addresses
+ and the partitions themselves, depending on the engine used.
+ axis : None, 0 or 1
+ The `axis` parameter identifies what kind of partitions are passed.
+ You have to set:
+ - `axis` to 0 if you want to create DataFrame from row partitions.
+ - `axis` to 1 if you want to create DataFrame from column partitions.
+ - `axis` to None if you want to create DataFrame from 2D list of partitions.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame instance created from remote partitions.
+ """
+ from modin.data_management.factories.dispatcher import EngineDispatcher
+
+ factory = EngineDispatcher.get_engine()
+
+ partition_class = factory.io_cls.frame_cls._frame_mgr_cls._partition_class
+ partition_frame_class = factory.io_cls.frame_cls
+ partition_mgr_class = factory.io_cls.frame_cls._frame_mgr_cls
+
+ # Since we store partitions of a Modin DataFrame as a 2D NumPy array, we need to put the
+ # passed partitions into a 2D NumPy array before passing them to the internal Modin frame class.
+ # `axis=None` - convert 2D list to 2D NumPy array
+ if axis is None:
+ if isinstance(partitions[0][0], tuple):
+ parts = np.array(
+ [
+ [partition_class(partition, ip=ip) for ip, partition in row]
+ for row in partitions
+ ]
+ )
+ else:
+ parts = np.array(
+ [
+ [partition_class(partition) for partition in row]
+ for row in partitions
+ ]
+ )
+ # `axis=0` - place row partitions to 2D NumPy array so that each row of the array is one row partition.
+ elif axis == 0:
+ if isinstance(partitions[0], tuple):
+ parts = np.array(
+ [[partition_class(partition, ip=ip)] for ip, partition in partitions]
+ )
+ else:
+ parts = np.array([[partition_class(partition)] for partition in partitions])
+ # `axis=1` - place column partitions to 2D NumPy array so that each column of the array is one column partition.
+ elif axis == 1:
+ if isinstance(partitions[0], tuple):
+ parts = np.array(
+ [[partition_class(partition, ip=ip) for ip, partition in partitions]]
+ )
+ else:
+ parts = np.array([[partition_class(partition) for partition in partitions]])
+ else:
+ raise ValueError(
+ f"Got unacceptable value of axis {axis}. Possible values are {0}, {1} or {None}."
+ )
+
+ index = partition_mgr_class.get_indices(0, parts, lambda df: df.axes[0])
+ columns = partition_mgr_class.get_indices(1, parts, lambda df: df.axes[1])
+ return DataFrame(
+ query_compiler=PandasQueryCompiler(partition_frame_class(parts, index, columns))
+ )
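Assuming Modin is running on the Ray or Dask engine, the API introduced in this file is intended to be used roughly like this (a usage sketch, not a test):

import modin.pandas as pd
from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions

df = pd.DataFrame({"a": range(100), "b": range(100)})

# Pull out the raw object references, one list entry per row partition.
row_parts = unwrap_partitions(df, axis=0)

# ...hand the references to other remote tasks, then rebuild a Modin DataFrame.
df2 = from_partitions(row_parts, axis=0)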
diff --git a/modin/engines/base/frame/axis_partition.py b/modin/engines/base/frame/axis_partition.py
index aaacacce875..2c42ee9cc5a 100644
--- a/modin/engines/base/frame/axis_partition.py
+++ b/modin/engines/base/frame/axis_partition.py
@@ -11,16 +11,16 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
+from abc import ABC
import pandas
+import numpy as np
from modin.data_management.utils import split_result_of_axis_func_pandas
-NOT_IMPLMENTED_MESSAGE = "Must be implemented in child class"
+class BaseFrameAxisPartition(ABC): # pragma: no cover
+ """An abstract class that represents the Parent class for any `ColumnPartition` or `RowPartition` class.
-class BaseFrameAxisPartition(object): # pragma: no cover
- """This abstract class represents the Parent class for any
- `ColumnPartition` or `RowPartition` class. This class is intended to
- simplify the way that operations are performed
+ This class is intended to simplify the way that operations are performed.
Note 0: The procedures that use this class and its methods assume that
they have some global knowledge about the entire axis. This may
@@ -46,7 +46,7 @@ def apply(
maintain_partitioning=True,
**kwargs,
):
- """Applies a function to a full axis.
+ """Apply a function to a full axis.
Note: The procedures that invoke this method assume full axis
knowledge. Implement this method accordingly.
@@ -69,10 +69,11 @@ def apply(
orientation (the lengths will remain the same). This is ignored between
two axis partitions.
- Returns:
+ Returns
+ -------
A list of `BaseFramePartition` objects.
"""
- raise NotImplementedError(NOT_IMPLMENTED_MESSAGE)
+ pass
def shuffle(self, func, lengths, **kwargs):
"""Shuffle the order of the data in this axis based on the `lengths`.
@@ -81,26 +82,74 @@ def shuffle(self, func, lengths, **kwargs):
func: The function to apply before splitting.
lengths: The list of partition lengths to split the result into.
- Returns:
+ Returns
+ -------
A list of RemotePartition objects split by `lengths`.
"""
- raise NotImplementedError(NOT_IMPLMENTED_MESSAGE)
+ pass
# Child classes must have these in order to correctly subclass.
instance_type = None
partition_type = None
def _wrap_partitions(self, partitions):
- if isinstance(partitions, self.instance_type):
- return [self.partition_type(partitions)]
+ return [self.partition_type(obj) for obj in partitions]
+
+ def coalesce(self, bind_ip=False):
+ """
+ Coalesce the axis partitions into a single partition.
+
+ Parameters
+ ----------
+ bind_ip : boolean, default False
+ Whether to bind node ip address to a single partition or not.
+
+ Returns
+ -------
+ BaseFrameAxisPartition
+ An axis partition containing only a single coalesced partition.
+ """
+ coalesced = self.apply(lambda x: x, num_splits=1, maintain_partitioning=False)
+ return type(self)(coalesced, bind_ip=bind_ip)
+
+ def unwrap(self, squeeze=False, bind_ip=False):
+ """
+ Unwrap partitions from axis partition.
+
+ Parameters
+ ----------
+ squeeze : boolean, default False
+ The flag used to unwrap only one partition.
+ bind_ip : boolean, default False
+ Whether to bind node ip address to each partition or not.
+
+ Returns
+ -------
+ list
+ List of partitions from axis partition.
+
+ Notes
+ -----
+ In case bind_ip=True, list containing tuples of Ray.ObjectRef/Dask.Future
+ to node ip addresses and unwrapped partitions, respectively, is returned
+ if Ray/Dask is used as an engine.
+ """
+ if squeeze and len(self.list_of_blocks) == 1:
+ if bind_ip:
+ return self.list_of_ips[0], self.list_of_blocks[0]
+ else:
+ return self.list_of_blocks[0]
else:
- return [self.partition_type(obj) for obj in partitions]
+ if bind_ip:
+ return list(zip(self.list_of_ips, self.list_of_blocks))
+ else:
+ return self.list_of_blocks
class PandasFrameAxisPartition(BaseFrameAxisPartition):
- """This abstract class is created to simplify and consolidate the code for
- AxisPartitions that run pandas. Because much of the code is similar, this allows
- us to reuse this code.
+ """An abstract class is created to simplify and consolidate the code for AxisPartitions that run pandas.
+
+ Because much of the code is similar, this allows us to reuse this code.
Subclasses must implement `list_of_blocks` which unwraps the `RemotePartition`
objects and creates something interpretable as a pandas DataFrame.
@@ -118,23 +167,28 @@ def apply(
maintain_partitioning=True,
**kwargs,
):
- """Applies func to the object in the plasma store.
+ """Apply func to the object in the plasma store.
See notes in Parent class about this method.
- Args:
- func: The function to apply.
- num_splits: The number of times to split the result object.
- other_axis_partition: Another `PandasOnRayFrameAxisPartition` object to apply to
- func with this one.
- maintain_partitioning: Whether or not to keep the partitioning in the same
- orientation as it was previously. This is important because we may be
- operating on an individual AxisPartition and not touching the rest.
- In this case, we have to return the partitioning to its previous
- orientation (the lengths will remain the same). This is ignored between
- two axis partitions.
+ Parameters
+ ----------
+ func: callable
+ The function to apply.
+ num_splits: int
+ The number of times to split the result object.
+ other_axis_partition: PandasOnRayFrameAxisPartition object
+ Another `PandasOnRayFrameAxisPartition` object to apply to func with this one.
+ maintain_partitioning: boolean
+ Whether or not to keep the partitioning in the same
+ orientation as it was previously. This is important because we may be
+ operating on an individual AxisPartition and not touching the rest.
+ In this case, we have to return the partitioning to its previous
+ orientation (the lengths will remain the same). This is ignored between
+ two axis partitions.
- Returns:
+ Returns
+ -------
A list of `RayRemotePartition` objects.
"""
if num_splits is None:
@@ -143,12 +197,13 @@ def apply(
if other_axis_partition is not None:
if not isinstance(other_axis_partition, list):
other_axis_partition = [other_axis_partition]
- other_shape = (
- len(other_axis_partition),
- len(other_axis_partition[0].list_of_blocks),
+
+ # (other_shape[i-1], other_shape[i]) indicates the slice
+ # used to restore the (i-1)-th axis partition
+ other_shape = np.cumsum(
+ [0] + [len(o.list_of_blocks) for o in other_axis_partition]
)
- if not self.axis:
- other_shape = tuple(reversed(other_shape))
+
return self._wrap_partitions(
self.deploy_func_between_two_axis_partitions(
self.axis,
@@ -180,7 +235,8 @@ def shuffle(self, func, lengths, **kwargs):
func: The function to apply before splitting.
lengths: The list of partition lengths to split the result into.
- Returns:
+ Returns
+ -------
A list of RemotePartition objects split by `lengths`.
"""
num_splits = len(lengths)
@@ -207,7 +263,8 @@ def deploy_axis_func(
If False, create a new partition layout.
partitions: All partitions that make up the full axis (row or column)
- Returns:
+ Returns
+ -------
A list of Pandas DataFrames.
"""
# Pop these off first because they aren't expected by the function.
@@ -216,10 +273,6 @@ def deploy_axis_func(
dataframe = pandas.concat(list(partitions), axis=axis, copy=False)
result = func(dataframe, **kwargs)
- if isinstance(result, pandas.Series):
- if num_splits == 1:
- return result
- return [result] + [pandas.Series([]) for _ in range(num_splits - 1)]
if manual_partition:
# The split function is expecting a list
@@ -267,14 +320,14 @@ def deploy_func_between_two_axis_partitions(
rt_parts = partitions[len_of_left:]
- # reshaping flattened `rt_parts` array into with shape `other_shape`
+ # reshaping flattened `rt_parts` array into a frame with shape `other_shape`
combined_axis = [
pandas.concat(
- [rt_parts[other_shape[axis] * i + j] for j in range(other_shape[axis])],
+ rt_parts[other_shape[i - 1] : other_shape[i]],
axis=axis,
copy=False,
)
- for i in range(other_shape[axis ^ 1])
+ for i in range(1, len(other_shape))
]
rt_frame = pandas.concat(combined_axis, axis=axis ^ 1, copy=False)
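A minimal sketch, assuming made-up block counts, of how the cumulative sums stored in `other_shape` turn the flattened `rt_parts` list into per-partition slices:

import numpy as np

# Hypothetical number of blocks in each of three broadcast axis partitions.
blocks_per_partition = [2, 3, 1]
other_shape = np.cumsum([0] + blocks_per_partition)  # -> [0, 2, 5, 6]

# (other_shape[i-1], other_shape[i]) bounds the blocks belonging to
# axis partition i-1 in the flattened list.
slices = [(other_shape[i - 1], other_shape[i]) for i in range(1, len(other_shape))]
assert slices == [(0, 2), (2, 5), (5, 6)]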
diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py
index e2a8ea0ab57..1e7c963de90 100644
--- a/modin/engines/base/frame/data.py
+++ b/modin/engines/base/frame/data.py
@@ -15,9 +15,8 @@
import numpy as np
import pandas
from pandas.core.indexes.api import ensure_index, Index, RangeIndex
-from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.dtypes.common import is_numeric_dtype
-from typing import Union
+from typing import List, Hashable
from modin.backends.pandas.query_compiler import PandasQueryCompiler
from modin.error_message import ErrorMessage
@@ -25,13 +24,17 @@
class BasePandasFrame(object):
+ """An abstract class that represents the Parent class for any Pandas DataFrame class.
+
+ This class is intended to simplify the way that operations are performed
+ """
_frame_mgr_cls = None
_query_compiler_cls = PandasQueryCompiler
@property
def __constructor__(self):
- """The constructor for this object. A convenience method"""
+ """Create a new instance of this object."""
return type(self)
def __init__(
@@ -42,7 +45,6 @@ def __init__(
row_lengths=None,
column_widths=None,
dtypes=None,
- validate_axes: Union[bool, str] = False,
):
"""Initialize a dataframe.
@@ -56,8 +58,6 @@ def __init__(
column_widths : (optional) The width of each partition in the columns. The
"width" of each of the block partitions. Is computed if not provided.
dtypes : (optional) The data types for the dataframe.
- validate_axes : (optional) Whether or not validate for equality
- internal indices of partitions and passed `index` and `columns`.
"""
self._partitions = partitions
self._index_cache = ensure_index(index)
@@ -79,15 +79,14 @@ def __init__(
)
self._column_widths_cache = column_widths
self._dtypes = dtypes
- if validate_axes is not False:
- self._validate_internal_indices(mode=validate_axes)
self._filter_empties()
@property
def _row_lengths(self):
"""Compute the row lengths if they are not cached.
- Returns:
+ Returns
+ -------
A list of row lengths.
"""
if self._row_lengths_cache is None:
@@ -103,7 +102,8 @@ def _row_lengths(self):
def _column_widths(self):
"""Compute the column widths if they are not cached.
- Returns:
+ Returns
+ -------
A list of column widths.
"""
if self._column_widths_cache is None:
@@ -113,11 +113,17 @@ def _column_widths(self):
self._column_widths_cache = []
return self._column_widths_cache
+ @property
+ def _axes_lengths(self):
+ """Row lengths, column widths that can be accessed with an `axis` integer."""
+ return [self._row_lengths, self._column_widths]
+
@property
def dtypes(self):
"""Compute the data types if they are not cached.
- Returns:
+ Returns
+ -------
A pandas Series containing the data types for this dataframe.
"""
if self._dtypes is None:
@@ -127,7 +133,8 @@ def dtypes(self):
def _compute_dtypes(self):
"""Compute the dtypes via MapReduce.
- Returns:
+ Returns
+ -------
The data types of this dataframe.
"""
@@ -149,13 +156,17 @@ def dtype_builder(df):
_columns_cache = None
def _validate_set_axis(self, new_labels, old_labels):
- """Validates the index or columns replacement against the old labels.
+ """Validate the index or columns replacement against the old labels.
- Args:
- new_labels: The labels to replace with.
- old_labels: The labels to replace.
+ Parameters
+ ----------
+ new_labels: list-like
+ The labels to replace with.
+ old_labels: list-like
+ The labels to replace.
- Returns:
+ Returns
+ -------
The validated labels.
"""
new_labels = ensure_index(new_labels)
@@ -169,26 +180,30 @@ def _validate_set_axis(self, new_labels, old_labels):
return new_labels
def _get_index(self):
- """Gets the index from the cache object.
+ """Get the index from the cache object.
- Returns:
+ Returns
+ -------
A pandas.Index object containing the row labels.
"""
return self._index_cache
def _get_columns(self):
- """Gets the columns from the cache object.
+ """Get the columns from the cache object.
- Returns:
+ Returns
+ -------
A pandas.Index object containing the column labels.
"""
return self._columns_cache
def _set_index(self, new_index):
- """Replaces the current row labels with new labels.
+ """Replace the current row labels with new labels.
- Args:
- new_index: The replacement row labels.
+ Parameters
+ ----------
+ new_index: list-like
+ The replacement row labels.
"""
if self._index_cache is None:
self._index_cache = ensure_index(new_index)
@@ -198,10 +213,12 @@ def _set_index(self, new_index):
self._apply_index_objs(axis=0)
def _set_columns(self, new_columns):
- """Replaces the current column labels with new labels.
+ """Replace the current column labels with new labels.
- Args:
- new_columns: The replacement column labels.
+ Parameters
+ ----------
+ new_columns: list-like
+ The replacement column labels.
"""
if self._columns_cache is None:
self._columns_cache = ensure_index(new_columns)
@@ -212,40 +229,39 @@ def _set_columns(self, new_columns):
self._dtypes.index = new_columns
self._apply_index_objs(axis=1)
- def _set_axis(self, axis, new_axis, cache_only=False):
- """Replaces the current labels at the specified axis with the new one
-
- Parameters
- ----------
- axis : int,
- Axis to set labels along
- new_axis : Index,
- The replacement labels
- cache_only : bool,
- Whether to change only external indices, or propagate it
- into partitions
- """
- if axis:
- if not cache_only:
- self._set_columns(new_axis)
- else:
- self._columns_cache = ensure_index(new_axis)
- else:
- if not cache_only:
- self._set_index(new_axis)
- else:
- self._index_cache = ensure_index(new_axis)
-
columns = property(_get_columns, _set_columns)
index = property(_get_index, _set_index)
@property
def axes(self):
- """The index, columns that can be accessed with an `axis` integer."""
+ """Index, columns that can be accessed with an `axis` integer."""
return [self.index, self.columns]
+ def _compute_axis_labels(self, axis: int, partitions=None):
+ """
+ Compute the labels for specific `axis`.
+
+ Parameters
+ ----------
+ axis: int
+ Axis to compute labels along
+ partitions: numpy 2D array (optional)
+ Partitions from which labels will be grabbed;
+ if not specified, `self._partitions` is used.
+
+ Returns
+ -------
+ pandas.Index
+ Labels for the specified `axis`.
+ """
+ if partitions is None:
+ partitions = self._partitions
+ return self._frame_mgr_cls.get_indices(
+ axis, partitions, lambda df: df.axes[axis]
+ )
+
def _filter_empties(self):
- """Removes empty partitions to avoid triggering excess computation."""
+ """Remove empty partitions to avoid triggering excess computation."""
if len(self.axes[0]) == 0 or len(self.axes[1]) == 0:
# This is the case for an empty frame. We don't want to completely remove
# all metadata and partitions so for the moment, we won't prune if the frame
@@ -257,96 +273,14 @@ def _filter_empties(self):
[
self._partitions[i][j]
for j in range(len(self._partitions[i]))
- if j < len(self._column_widths) and self._column_widths[j] > 0
+ if j < len(self._column_widths) and self._column_widths[j] != 0
]
for i in range(len(self._partitions))
- if i < len(self._row_lengths) and self._row_lengths[i] > 0
+ if i < len(self._row_lengths) and self._row_lengths[i] != 0
]
)
- self._column_widths_cache = [w for w in self._column_widths if w > 0]
- self._row_lengths_cache = [r for r in self._row_lengths if r > 0]
-
- def _validate_axis_equality(self, axis: int, force: bool = False):
- """
- Validates internal and external indices of modin_frame at the specified axis.
-
- Parameters
- ----------
- axis : 0 or 1
- Axis to validate indices along (0 - index, 1 - columns).
- force : boolean, default False
- Whether to update external indices with internal if their lengths
- do not match or raise an exception in that case.
- """
- internal_axis = self._frame_mgr_cls.get_indices(
- axis, self._partitions, lambda df: df.axes[axis]
- )
- self_axis = self.axes[axis]
- is_equals = self_axis.equals(internal_axis)
- if (
- isinstance(self_axis, DatetimeIndex)
- and isinstance(internal_axis, DatetimeIndex)
- and is_equals
- ):
- if getattr(self_axis, "freq") != getattr(internal_axis, "freq"):
- is_equals = False
- force = True
- is_lenghts_matches = len(self_axis) == len(internal_axis)
- if not is_equals:
- if not is_lenghts_matches:
- if axis:
- self._column_widths_cache = None
- else:
- self._row_lengths_cache = None
- new_axis = self_axis if is_lenghts_matches and not force else internal_axis
- self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches)
-
- def _validate_internal_indices(self, mode=None, **kwargs):
- """
- Validates and optionally updates internal and external indices
- of modin_frame in specified mode. There is 3 modes supported:
- 1. "reduced" - force validates on that axes
- where external indices is ["__reduced__"]
- 2. "all" - validates indices at all axes, optionally force
- if `force` parameter specified in kwargs
- 3. "custom" - validation follows arguments specified in kwargs.
-
- Parameters
- ----------
- mode : str or bool, default None
- validate_index : bool, (optional, could be specified via `mode`)
- validate_columns : bool, (optional, could be specified via `mode`)
- force : bool (optional, could be specified via `mode`)
- Whether to update external indices with internal if their lengths
- do not match or raise an exception in that case.
- """
- if isinstance(mode, bool):
- is_force = mode
- mode = "all"
- else:
- is_force = kwargs.get("force", False)
-
- reduced_sample = pandas.Index(["__reduced__"])
- args_dict = {
- "custom": kwargs,
- "reduced": {
- "validate_index": self.index.equals(reduced_sample),
- "validate_columns": self.columns.equals(reduced_sample),
- "force": False,
- },
- "all": {
- "validate_index": True,
- "validate_columns": True,
- "force": is_force,
- },
- }
-
- args = args_dict.get(mode, args_dict["custom"])
-
- if args.get("validate_index", True):
- self._validate_axis_equality(axis=0, force=args.get("force"))
- if args.get("validate_columns", True):
- self._validate_axis_equality(axis=1, force=args.get("force"))
+ self._column_widths_cache = [w for w in self._column_widths if w != 0]
+ self._row_lengths_cache = [r for r in self._row_lengths if r != 0]
def _apply_index_objs(self, axis=None):
"""Lazily applies the index object (Index or Columns) to the partitions.
@@ -354,7 +288,8 @@ def _apply_index_objs(self, axis=None):
Args:
axis: The axis to apply to, None applies to both axes.
- Returns:
+ Returns
+ -------
A new 2D array of partitions that have the index assignment added to the
call queue.
"""
@@ -563,7 +498,6 @@ def mask(
new_row_lengths,
new_col_widths,
new_dtypes,
- validate_axes="all" if new_partitions.size != 0 else False,
)
# Check if monotonically increasing, return if it is. Fast track code path for
# common case to keep it fast.
@@ -603,6 +537,90 @@ def mask(
row_numeric_idx=new_row_order, col_numeric_idx=new_col_order
)
+ def from_labels(self) -> "BasePandasFrame":
+ """Convert the row labels to a column of data, inserted at the first position.
+
+ Returns
+ -------
+ BasePandasFrame
+ A new BasePandasFrame.
+ """
+ new_row_labels = pandas.RangeIndex(len(self.index))
+ # Column labels are different for multilevel index.
+ if len(self.index.names) > 1:
+ # We will also use the `new_column_names` in the calculation of the internal metadata, so this is a
+ # lightweight way of ensuring the metadata matches.
+ new_column_names = pandas.Index(
+ [
+ self.index.names[i]
+ if self.index.names[i] is not None
+ else "level_{}".format(i)
+ for i in range(len(self.index.names))
+ ]
+ )
+ new_columns = new_column_names.append(self.columns)
+ else:
+ # See note above about usage of `new_column_names`.
+ new_column_names = pandas.Index(
+ [
+ self.index.names[0]
+ if self.index.names[0] is not None
+ else "index"
+ if "index" not in self.columns
+ else "level_{}".format(0)
+ ]
+ )
+ new_columns = new_column_names.append(self.columns)
+
+ def from_labels_executor(df, **kwargs):
+ # Setting the names here ensures that external and internal metadata always match.
+ df.index.names = new_column_names
+ return df.reset_index()
+
+ new_parts = self._frame_mgr_cls.apply_func_to_select_indices(
+ 0,
+ self._partitions,
+ from_labels_executor,
+ [0],
+ keep_remaining=True,
+ )
+ new_column_widths = [
+ len(self.index.names) + self._column_widths[0]
+ ] + self._column_widths[1:]
+ result = self.__constructor__(
+ new_parts,
+ new_row_labels,
+ new_columns,
+ row_lengths=self._row_lengths_cache,
+ column_widths=new_column_widths,
+ )
+ # Propagate the new row labels to all of the dataframe partitions
+ result._apply_index_objs(0)
+ return result
+
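The naming fallbacks that `from_labels` computes mirror what plain pandas does on `reset_index`; a small standalone illustration with invented data:

import pandas

# An unnamed single index becomes a column named "index"
# (or "level_0" if a column named "index" already exists).
df = pandas.DataFrame({"a": [1, 2]}, index=[10, 20])
assert df.reset_index().columns.tolist() == ["index", "a"]

# Unnamed MultiIndex levels become "level_0", "level_1", ...
mi = pandas.MultiIndex.from_tuples([(1, "x"), (2, "y")])
df2 = pandas.DataFrame({"a": [1, 2]}, index=mi)
assert df2.reset_index().columns.tolist() == ["level_0", "level_1", "a"]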
+ def to_labels(self, column_list: List[Hashable]) -> "BasePandasFrame":
+ """Move one or more columns into the row labels. Previous labels are dropped.
+
+ Parameters
+ ----------
+ column_list : list of hashable
+ The list of column names to place as the new row labels.
+
+ Returns
+ -------
+ A new BasePandasFrame that has the updated labels.
+ """
+ extracted_columns = self.mask(col_indices=column_list).to_pandas()
+ if len(column_list) == 1:
+ new_labels = pandas.Index(extracted_columns.squeeze(axis=1))
+ else:
+ new_labels = pandas.MultiIndex.from_frame(extracted_columns)
+ result = self.mask(
+ col_indices=[i for i in self.columns if i not in column_list]
+ )
+ result.index = new_labels
+ return result
+
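A rough plain-pandas equivalent of `to_labels`, showing the flat-Index versus MultiIndex branch; the frame and column names here are invented:

import pandas

df = pandas.DataFrame({"k1": [1, 2], "k2": ["x", "y"], "v": [10, 20]})
column_list = ["k1", "k2"]

extracted = df[column_list]
if len(column_list) == 1:
    new_labels = pandas.Index(extracted.squeeze(axis=1))
else:
    new_labels = pandas.MultiIndex.from_frame(extracted)

result = df[[c for c in df.columns if c not in column_list]].copy()
result.index = new_labels
assert list(result.index.names) == ["k1", "k2"]
assert result.columns.tolist() == ["v"]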
def reorder_labels(self, row_numeric_idx=None, col_numeric_idx=None):
"""Reorder the column and or rows in this DataFrame.
@@ -641,7 +659,8 @@ def reorder_labels(self, row_numeric_idx=None, col_numeric_idx=None):
def copy(self):
"""Copy this object.
- Returns:
+ Returns
+ -------
A copied version of this object.
"""
return self.__constructor__(
@@ -655,13 +674,14 @@ def copy(self):
@classmethod
def combine_dtypes(cls, list_of_dtypes, column_names):
- """Describes how data types should be combined when they do not match.
+ """Describe how data types should be combined when they do not match.
Args:
list_of_dtypes: A list of pandas Series with the data types.
column_names: The names of the columns that the data types map to.
- Returns:
+ Returns
+ -------
A pandas Series containing the finalized data types.
"""
# Compute dtypes by getting collecting and combining all of the partitions. The
@@ -677,13 +697,14 @@ def combine_dtypes(cls, list_of_dtypes, column_names):
return dtypes
def astype(self, col_dtypes):
- """Converts columns dtypes to given dtypes.
+ """Convert the columns dtypes to given dtypes.
Args:
col_dtypes: Dictionary of {col: dtype,...} where col is the column
name and dtype is a numpy dtype.
- Returns:
+ Returns
+ -------
dataframe with updated dtypes.
"""
columns = col_dtypes.keys()
@@ -715,9 +736,7 @@ def astype(self, col_dtypes):
def astype_builder(df):
return df.astype({k: v for k, v in col_dtypes.items() if k in df})
- new_frame = self._frame_mgr_cls.lazy_map_partitions(
- self._partitions, astype_builder
- )
+ new_frame = self._frame_mgr_cls.map_partitions(self._partitions, astype_builder)
return self.__constructor__(
new_frame,
self.index,
@@ -735,7 +754,8 @@ def add_prefix(self, prefix, axis):
prefix: The prefix to add.
axis: The axis to update.
- Returns:
+ Returns
+ -------
A new dataframe with the updated labels.
"""
new_labels = self.axes[axis].map(lambda x: str(prefix) + str(x))
@@ -753,7 +773,8 @@ def add_suffix(self, suffix, axis):
suffix: The suffix to add.
axis: The axis to update.
- Returns:
+ Returns
+ -------
A new dataframe with the updated labels.
"""
new_labels = self.axes[axis].map(lambda x: str(x) + str(suffix))
@@ -767,9 +788,10 @@ def add_suffix(self, suffix, axis):
# END Metadata modification methods
def _numeric_columns(self, include_bool=True):
- """Returns the numeric columns of the Manager.
+ """Return the numeric columns of the Manager.
- Returns:
+ Returns
+ -------
List of index names.
"""
columns = []
@@ -904,46 +926,90 @@ def internal(block_idx, global_index):
]
return OrderedDict(partition_ids_with_indices)
- def _join_index_objects(self, axis, other_index, how, sort):
+ @staticmethod
+ def _join_index_objects(axis, indexes, how, sort):
"""
- Joins a pair of index objects (columns or rows) by a given strategy.
+ Join the index objects (columns or rows) by a given strategy.
Unlike Index.join() in Pandas, if axis is 1, the sort is
False, and how is "outer", the result will _not_ be sorted.
Parameters
----------
- axis : 0 or 1
- The axis index object to join (0 - rows, 1 - columns).
- other_index : Index
- The other_index to join on.
- how : {'left', 'right', 'inner', 'outer'}
- The type of join to join to make.
- sort : boolean
- Whether or not to sort the joined index
+ axis : 0 or 1
+ The axis index object to join (0 - rows, 1 - columns).
+ indexes : list(Index)
+ The indexes to join on.
+ how : {'left', 'right', 'inner', 'outer', None}
+ The type of join to make. If `None`, the joined index is
+ taken to be the first index in the `indexes` list.
+ sort : boolean
+ Whether or not to sort the joined index.
Returns
-------
- Index
- Joined indices.
+ (Index, func)
+ The joined index and the `make_reindexer` function.
"""
+ assert isinstance(indexes, list)
- def merge_index(obj1, obj2):
+ # define helper functions
+ def merge(left_index, right_index):
if axis == 1 and how == "outer" and not sort:
- return obj1.union(obj2, sort=False)
+ return left_index.union(right_index, sort=False)
+ else:
+ return left_index.join(right_index, how=how, sort=sort)
+
+ # define condition for joining indexes
+ all_indices_equal = all(indexes[0].equals(index) for index in indexes[1:])
+ do_join_index = how is not None and not all_indices_equal
+
+ # define condition for joining indexes and getting indexers
+ need_indexers = (
+ axis == 0
+ and not all_indices_equal
+ and any(not index.is_unique for index in indexes)
+ )
+ indexers = None
+
+ # perform joining indexes
+ if do_join_index:
+ if len(indexes) == 2 and need_indexers:
+ # when there are more than two indexes we have to join them all
+ # first and only then compute the indexers; in this fast path
+ # (exactly two indexes) joined_index and indexers come from one call
+ indexers = [None, None]
+ joined_index, indexers[0], indexers[1] = indexes[0].join(
+ indexes[1], how=how, sort=sort, return_indexers=True
+ )
else:
- return obj1.join(obj2, how=how, sort=sort)
-
- if isinstance(other_index, list):
- joined_obj = self.columns if axis else self.index
- # TODO: revisit for performance
- for obj in other_index:
- joined_obj = merge_index(joined_obj, obj)
- return joined_obj
- if axis:
- return merge_index(self.columns, other_index)
+ joined_index = indexes[0]
+ # TODO: revisit for performance
+ for index in indexes[1:]:
+ joined_index = merge(joined_index, index)
else:
- return self.index.join(other_index, how=how, sort=sort)
+ joined_index = indexes[0].copy()
+
+ if need_indexers and indexers is None:
+ indexers = [index.get_indexer_for(joined_index) for index in indexes]
+
+ def make_reindexer(do_reindex: bool, frame_idx: int):
+ # the order of the frames must match the order of the indexes
+ if not do_reindex:
+ return lambda df: df
+
+ if need_indexers:
+ assert indexers is not None
+
+ return lambda df: df._reindex_with_indexers(
+ {0: [joined_index, indexers[frame_idx]]},
+ copy=True,
+ allow_dups=True,
+ )
+
+ return lambda df: df.reindex(joined_index, axis=axis)
+
+ return joined_index, make_reindexer
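The two pandas primitives the rewritten `_join_index_objects` leans on can be exercised on their own; a short sketch with throwaway indexes:

import pandas

left = pandas.Index(["a", "b", "c"])
right = pandas.Index(["b", "c", "d"])

# Fast path for exactly two indexes: one call yields the joined index
# plus an indexer for each side.
joined, lidx, ridx = left.join(right, how="outer", return_indexers=True)

# General path: ask an index for the positions of the joined labels;
# -1 marks labels that are missing from that index.
assert list(left.get_indexer_for(joined)) == [0, 1, 2, -1]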
# Internal methods
# These methods are for building the correct answer in a modular way.
@@ -955,11 +1021,15 @@ def _build_mapreduce_func(self, axis, func):
Note: This should be used for any MapReduce style operation that results in a
reduced data dimensionality (dataframe -> series).
- Args:
- axis: The axis along which to apply the function.
- func: The function to apply.
+ Parameters
+ ----------
+ axis: int
+ The axis along which to apply the function.
+ func: callable
+ The function to apply.
- Returns:
+ Returns
+ -------
A function to be shipped to the partitions to be executed.
"""
@@ -979,36 +1049,53 @@ def _map_reduce_func(df, *args, **kwargs):
return _map_reduce_func
- def _compute_map_reduce_metadata(self, axis, new_parts):
- if axis == 0:
- columns = self.columns
- index = ["__reduced__"]
- new_lengths = [1]
- new_widths = self._column_widths
+ def _compute_map_reduce_metadata(self, axis, new_parts, preserve_index=True):
+ """
+ Compute the metadata for the result of reduce function.
+
+ Parameters
+ ----------
+ axis: int,
+ The axis on which reduce function was applied
+ new_parts: numpy 2D array
+ Partitions with the result of applied function
+ preserve_index: boolean
+ The flag to preserve labels for the reduced axis.
+
+ Returns
+ -------
+ BasePandasFrame
+ A new Modin frame containing the reduced data.
+ """
+ new_axes, new_axes_lengths = [0, 0], [0, 0]
+
+ new_axes[axis] = ["__reduced__"]
+ if preserve_index:
+ new_axes[axis ^ 1] = self.axes[axis ^ 1]
+ else:
+ new_axes[axis ^ 1] = self._compute_axis_labels(axis ^ 1, new_parts)
+
+ new_axes_lengths[axis] = [1]
+ new_axes_lengths[axis ^ 1] = self._axes_lengths[axis ^ 1]
+
+ if axis == 0 or self._dtypes is None:
new_dtypes = self._dtypes
+ elif preserve_index:
+ new_dtypes = pandas.Series(
+ [find_common_type(self.dtypes.values)], index=new_axes[axis]
+ )
else:
- columns = ["__reduced__"]
- index = self.index
- new_lengths = self._row_lengths
- new_widths = [1]
- if self._dtypes is not None:
- new_dtypes = pandas.Series(
- np.full(1, find_common_type(self.dtypes.values)),
- index=["__reduced__"],
- )
- else:
- new_dtypes = self._dtypes
- return self.__constructor__(
+ new_dtypes = None
+ result = self.__constructor__(
new_parts,
- index,
- columns,
- new_lengths,
- new_widths,
+ *new_axes,
+ *new_axes_lengths,
new_dtypes,
- validate_axes="reduced",
)
+ result._apply_index_objs(axis)
+ return result
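The `axis ^ 1` bookkeeping above is just an XOR flip between 0 and 1, so one pair of assignments covers both the reduced axis and the preserved one; a toy illustration with invented labels:

# XOR with 1 toggles the axis: 0 <-> 1.
assert (0 ^ 1, 1 ^ 1) == (1, 0)

axis = 0  # pretend the reduction ran along the rows
axes = [["r0", "r1"], ["c0", "c1", "c2"]]

new_axes = [None, None]
new_axes[axis] = ["__reduced__"]      # the reduced axis collapses to one label
new_axes[axis ^ 1] = axes[axis ^ 1]   # the other axis keeps its labels
assert new_axes == [["__reduced__"], ["c0", "c1", "c2"]]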
- def _fold_reduce(self, axis, func):
+ def _fold_reduce(self, axis, func, preserve_index=True):
"""
Apply function that reduce Manager to series but require knowledge of full axis.
@@ -1018,6 +1105,8 @@ def _fold_reduce(self, axis, func):
The axis to apply the function to (0 - index, 1 - columns).
func : callable
The function to reduce the Manager by. This function takes in a Manager.
+ preserve_index : boolean
+ The flag to preserve labels for the reduced axis.
Returns
-------
@@ -1028,7 +1117,9 @@ def _fold_reduce(self, axis, func):
new_parts = self._frame_mgr_cls.map_axis_partitions(
axis, self._partitions, func
)
- return self._compute_map_reduce_metadata(axis, new_parts)
+ return self._compute_map_reduce_metadata(
+ axis, new_parts, preserve_index=preserve_index
+ )
def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
"""
@@ -1058,31 +1149,18 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
else:
reduce_func = self._build_mapreduce_func(axis, reduce_func)
- map_parts = self._frame_mgr_cls.lazy_map_partitions(self._partitions, map_func)
+ map_parts = self._frame_mgr_cls.map_partitions(self._partitions, map_func)
reduce_parts = self._frame_mgr_cls.map_axis_partitions(
axis, map_parts, reduce_func
)
- if preserve_index:
- return self._compute_map_reduce_metadata(axis, reduce_parts)
- else:
- if axis == 0:
- new_index = ["__reduced__"]
- new_columns = self._frame_mgr_cls.get_indices(
- 1, reduce_parts, lambda df: df.columns
- )
- else:
- new_index = self._frame_mgr_cls.get_indices(
- 0, reduce_parts, lambda df: df.index
- )
- new_columns = ["__reduced__"]
- return self.__constructor__(
- reduce_parts, new_index, new_columns, validate_axes="reduced"
- )
+ return self._compute_map_reduce_metadata(
+ axis, reduce_parts, preserve_index=preserve_index
+ )
def _map(self, func, dtypes=None, validate_index=False, validate_columns=False):
"""Perform a function that maps across the entire dataset.
- Pamareters
+ Parameters
----------
func : callable
The function to apply.
@@ -1092,44 +1170,38 @@ def _map(self, func, dtypes=None, validate_index=False, validate_columns=False):
type, and allows us to avoid (re)computing it.
validate_index : bool, (default False)
Is index validation required after performing `func` on partitions.
+
Returns
-------
A new dataframe.
"""
- new_partitions = self._frame_mgr_cls.lazy_map_partitions(self._partitions, func)
+ new_partitions = self._frame_mgr_cls.map_partitions(self._partitions, func)
if dtypes == "copy":
dtypes = self._dtypes
elif dtypes is not None:
dtypes = pandas.Series(
[np.dtype(dtypes)] * len(self.columns), index=self.columns
)
- if validate_index:
- new_index = self._frame_mgr_cls.get_indices(
- 0, new_partitions, lambda df: df.index
- )
- else:
- new_index = self.index
- if len(new_index) != len(self.index):
- new_row_lengths = None
- else:
- new_row_lengths = self._row_lengths
- if validate_columns:
- new_columns = self._frame_mgr_cls.get_indices(
- 1, new_partitions, lambda df: df.columns
- )
- else:
- new_columns = self.columns
- if len(new_columns) != len(self.columns):
- new_column_widths = None
- else:
- new_column_widths = self._column_widths
+ axis_validate_mask = [validate_index, validate_columns]
+ new_axes = [
+ self._compute_axis_labels(axis, new_partitions)
+ if should_validate
+ else self.axes[axis]
+ for axis, should_validate in enumerate(axis_validate_mask)
+ ]
+
+ new_lengths = [
+ self._axes_lengths[axis]
+ if len(new_axes[axis]) == len(self.axes[axis])
+ else None
+ for axis in [0, 1]
+ ]
+
return self.__constructor__(
new_partitions,
- new_index,
- new_columns,
- new_row_lengths,
- new_column_widths,
+ *new_axes,
+ *new_lengths,
dtypes=dtypes,
)
@@ -1138,11 +1210,15 @@ def _fold(self, axis, func):
Note: The data shape is not changed (length and width of the table).
- Args:
- axis: The axis to apply over.
- func: The function to apply.
+ Parameters
+ ----------
+ axis: int
+ The axis to apply over.
+ func: callable
+ The function to apply.
- Returns:
+ Returns
+ -------
A new dataframe.
"""
new_partitions = self._frame_mgr_cls.map_axis_partitions(
@@ -1159,37 +1235,33 @@ def _fold(self, axis, func):
def filter_full_axis(self, axis, func):
"""Filter data based on the function provided along an entire axis.
- Args:
- axis: The axis to filter over.
- func: The function to use for the filter. This function should filter the
+ Parameters
+ ----------
+ axis: int
+ The axis to filter over.
+ func: callable
+ The function to use for the filter. This function should filter the
data itself.
- Returns:
+ Returns
+ -------
A new dataframe.
"""
new_partitions = self._frame_mgr_cls.map_axis_partitions(
axis, self._partitions, func, keep_partitioning=True
)
- if axis == 0:
- new_index = self.index
- new_lengths = self._row_lengths
- new_widths = None # We do not know what the resulting widths will be
- new_columns = self._frame_mgr_cls.get_indices(
- 1, new_partitions, lambda df: df.columns
- )
- else:
- new_columns = self.columns
- new_lengths = None # We do not know what the resulting lengths will be
- new_widths = self._column_widths
- new_index = self._frame_mgr_cls.get_indices(
- 0, new_partitions, lambda df: df.index
- )
+ new_axes, new_lengths = [0, 0], [0, 0]
+
+ new_axes[axis] = self.axes[axis]
+ new_axes[axis ^ 1] = self._compute_axis_labels(axis ^ 1, new_partitions)
+
+ new_lengths[axis] = self._axes_lengths[axis]
+ new_lengths[axis ^ 1] = None  # We do not know what the resulting lengths/widths will be
+
return self.__constructor__(
new_partitions,
- new_index,
- new_columns,
- new_lengths,
- new_widths,
+ *new_axes,
+ *new_lengths,
self.dtypes if axis == 0 else None,
)
@@ -1251,18 +1323,27 @@ def _apply_full_axis_select_indices(
):
"""Apply a function across an entire axis for a subset of the data.
- Args:
- axis: The axis to apply over.
- func: The function to apply
- apply_indices: The labels to apply over.
- numeric_indices: The indices to apply over.
- new_index: (optional) The index of the result. We may know this in advance,
+ Parameters
+ ----------
+ axis: int
+ The axis to apply over.
+ func: callable
+ The function to apply
+ apply_indices: list-like
+ The labels to apply over.
+ numeric_indices: list-like
+ The indices to apply over.
+ new_index: list-like (optional)
+ The index of the result. We may know this in advance,
and if not provided it must be computed.
- new_columns: (optional) The columns of the result. We may know this in
+ new_columns: list-like (optional)
+ The columns of the result. We may know this in
advance, and if not provided it must be computed.
- keep_remaining: Whether or not to drop the data that is not computed over.
+ keep_remaining: boolean
+ Whether or not to keep the data that is not computed over.
- Returns:
+ Returns
+ -------
A new dataframe.
"""
assert apply_indices is not None or numeric_indices is not None
@@ -1303,7 +1384,8 @@ def _apply_select_indices(
):
"""Apply a function for a subset of the data.
- Args:
+ Parameters
+ ----------
axis: The axis to apply over.
func: The function to apply
apply_indices: (optional) The labels to apply over. Must be given if axis is
@@ -1320,7 +1402,8 @@ def _apply_select_indices(
item_to_distribute: (optional) The item to split up so it can be applied
over both axes.
- Returns:
+ Returns
+ -------
A new dataframe.
"""
# TODO Infer columns and index from `keep_remaining` and `apply_indices`
@@ -1380,23 +1463,34 @@ def _apply_select_indices(
self._column_widths_cache,
)
- def broadcast_apply(self, axis, func, other, preserve_labels=True, dtypes=None):
- """Broadcast partitions of other dataframe partitions and apply a function.
+ def broadcast_apply(
+ self, axis, func, other, join_type="left", preserve_labels=True, dtypes=None
+ ):
+ """
+ Broadcast partitions of the `other` dataframe and apply a function.
- Args:
- axis: The axis to broadcast over.
- func: The function to apply.
- other: The Modin DataFrame to broadcast.
- preserve_labels: Whether or not to keep labels from this Modin DataFrame.
- dtypes: "copy" or None. Whether to keep old dtypes or infer new dtypes from
- data.
-
- Returns:
- A new Modin DataFrame
+ Parameters
+ ----------
+ axis: int,
+ The axis to broadcast over.
+ func: callable,
+ The function to apply.
+ other: BasePandasFrame
+ The Modin DataFrame to broadcast.
+ join_type: str (optional)
+ The type of join to apply.
+ preserve_labels: boolean (optional)
+ Whether or not to keep labels from this Modin DataFrame.
+ dtypes: "copy" or None (optional)
+ Whether to keep old dtypes or infer new dtypes from data.
+
+ Returns
+ -------
+ BasePandasFrame
"""
# Only sort the indices if they do not match
left_parts, right_parts, joined_index = self._copartition(
- axis, other, "left", sort=not self.axes[axis].equals(other.axes[axis])
+ axis, other, join_type, sort=not self.axes[axis].equals(other.axes[axis])
)
# unwrap list returned by `copartition`.
right_parts = right_parts[0]
@@ -1418,7 +1512,7 @@ def broadcast_apply(self, axis, func, other, preserve_labels=True, dtypes=None):
def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all):
"""
- Computes indices to broadcast `self` with considering of `indices`
+ Compute the indices to broadcast `self` considering `indices`.
Parameters
----------
@@ -1468,8 +1562,7 @@ def broadcast_apply_select_indices(
new_columns=None,
):
"""
- Applyies `func` to select indices at specified axis and broadcasts
- partitions of `other` frame.
+ Apply `func` to select indices at the specified axis and broadcast partitions of the `other` frame.
Parameters
----------
@@ -1530,15 +1623,15 @@ def broadcast_apply_select_indices(
broadcasted_dict,
keep_remaining,
)
- if new_index is None:
- new_index = self._frame_mgr_cls.get_indices(
- 0, new_partitions, lambda df: df.index
- )
- if new_columns is None:
- new_columns = self._frame_mgr_cls.get_indices(
- 1, new_partitions, lambda df: df.columns
- )
- return self.__constructor__(new_partitions, new_index, new_columns)
+
+ new_axes = [
+ self._compute_axis_labels(i, new_partitions)
+ if new_axis is None
+ else new_axis
+ for i, new_axis in enumerate([new_index, new_columns])
+ ]
+
+ return self.__constructor__(new_partitions, *new_axes)
def broadcast_apply_full_axis(
self,
@@ -1547,6 +1640,8 @@ def broadcast_apply_full_axis(
other,
new_index=None,
new_columns=None,
+ apply_indices=None,
+ enumerate_partitions=False,
dtypes=None,
):
"""Broadcast partitions of other dataframe partitions and apply a function along full axis.
@@ -1557,13 +1652,18 @@ def broadcast_apply_full_axis(
The axis to apply over (0 - rows, 1 - columns).
func : callable
The function to apply.
- other : other Modin frame to broadcast
+ other : Modin frame or list of Modin frames to broadcast
new_index : list-like (optional)
The index of the result. We may know this in advance,
and if not provided it must be computed.
new_columns : list-like (optional)
The columns of the result. We may know this in
advance, and if not provided it must be computed.
+ apply_indices : list-like (optional),
+ Indices of `axis ^ 1` to apply function over.
+ enumerate_partitions : bool (optional, default False),
+ Whether or not to pass partition index into applied `func`.
+ Note that `func` must be able to accept a `partition_idx` kwarg.
dtypes : list-like (optional)
The data types of the result. This is an optimization
because there are functions that always result in a particular data
@@ -1573,57 +1673,73 @@ def broadcast_apply_full_axis(
-------
A new Modin DataFrame
"""
+ if other is not None:
+ if not isinstance(other, list):
+ other = [other]
+ other = [o._partitions for o in other] if len(other) else None
+
+ if apply_indices is not None:
+ numeric_indices = self.axes[axis ^ 1].get_indexer_for(apply_indices)
+ apply_indices = self._get_dict_of_block_index(
+ axis ^ 1, numeric_indices
+ ).keys()
+
new_partitions = self._frame_mgr_cls.broadcast_axis_partitions(
axis=axis,
left=self._partitions,
- right=other if other is None else other._partitions,
+ right=other,
apply_func=self._build_mapreduce_func(axis, func),
+ apply_indices=apply_indices,
+ enumerate_partitions=enumerate_partitions,
keep_partitioning=True,
)
# Index objects for new object creation. This is shorter than if..else
- if new_columns is None:
- new_columns = self._frame_mgr_cls.get_indices(
- 1, new_partitions, lambda df: df.columns
- )
- if new_index is None:
- new_index = self._frame_mgr_cls.get_indices(
- 0, new_partitions, lambda df: df.index
- )
+ new_axes = [
+ self._compute_axis_labels(i, new_partitions)
+ if new_axis is None
+ else new_axis
+ for i, new_axis in enumerate([new_index, new_columns])
+ ]
if dtypes == "copy":
dtypes = self._dtypes
elif dtypes is not None:
dtypes = pandas.Series(
- [np.dtype(dtypes)] * len(new_columns), index=new_columns
+ [np.dtype(dtypes)] * len(new_axes[1]), index=new_axes[1]
)
- return self.__constructor__(
+ result = self.__constructor__(
new_partitions,
- new_index,
- new_columns,
+ *new_axes,
None,
None,
dtypes,
- validate_axes="all" if new_partitions.size != 0 else False,
)
+ if new_index is not None:
+ result._apply_index_objs(0)
+ if new_columns is not None:
+ result._apply_index_objs(1)
+ return result
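The label handling above follows a small "use the hint if given, otherwise compute" pattern; stripped of the Modin-specific calls it reduces to the following, with placeholder values:

# Pretend the caller knew the columns in advance but not the index.
new_index, new_columns = None, ["c0", "c1"]

def compute_labels(axis):
    # stand-in for self._compute_axis_labels(axis, new_partitions)
    return ["computed_{}".format(axis)]

new_axes = [
    compute_labels(i) if new_axis is None else new_axis
    for i, new_axis in enumerate([new_index, new_columns])
]
assert new_axes == [["computed_0"], ["c0", "c1"]]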
def _copartition(self, axis, other, how, sort, force_repartition=False):
"""
Copartition two dataframes.
+ Perform alignment of partitions, index and partition blocks.
+
Parameters
----------
- axis : 0 or 1
- The axis to copartition along (0 - rows, 1 - columns).
- other : BasePandasFrame
- The other dataframes(s) to copartition against.
- how : str
- How to manage joining the index object ("left", "right", etc.)
- sort : boolean
- Whether or not to sort the joined index.
- force_repartition : boolean
- Whether or not to force the repartitioning. By default,
- this method will skip repartitioning if it is possible. This is because
- reindexing is extremely inefficient. Because this method is used to
- `join` or `append`, it is vital that the internal indices match.
+ axis : 0 or 1
+ The axis to copartition along (0 - rows, 1 - columns).
+ other : BasePandasFrame
+ The other dataframes(s) to copartition against.
+ how : str
+ How to manage joining the index object ("left", "right", etc.)
+ sort : boolean
+ Whether or not to sort the joined index.
+ force_repartition : bool, default False
+ Whether or not to force the repartitioning. By default,
+ this method will skip repartitioning if it is possible. This is because
+ reindexing is extremely inefficient. Because this method is used to
+ `join` or `append`, it is vital that the internal indices match.
Returns
-------
@@ -1632,39 +1748,89 @@ def _copartition(self, axis, other, how, sort, force_repartition=False):
"""
if isinstance(other, type(self)):
other = [other]
- if all(o.axes[axis].equals(self.axes[axis]) for o in other):
- return (
- self._partitions,
- [self._simple_shuffle(axis, o) for o in other],
- self.axes[axis].copy(),
- )
- index_other_obj = [o.axes[axis] for o in other]
- joined_index = self._join_index_objects(axis, index_other_obj, how, sort)
- # We have to set these because otherwise when we perform the functions it may
- # end up serializing this entire object.
- left_old_idx = self.axes[axis]
- right_old_idxes = index_other_obj
-
- # Start with this and we'll repartition the first time, and then not again.
- if not left_old_idx.equals(joined_index) or force_repartition:
- reindexed_self = self._frame_mgr_cls.map_axis_partitions(
- axis, self._partitions, lambda df: df.reindex(joined_index, axis=axis)
+
+ # define helper functions
+ def get_axis_lengths(partitions, axis):
+ if axis:
+ return [obj.width() for obj in partitions[0]]
+ return [obj.length() for obj in partitions.T[0]]
+
+ self_index = self.axes[axis]
+ others_index = [o.axes[axis] for o in other]
+ joined_index, make_reindexer = self._join_index_objects(
+ axis, [self_index] + others_index, how, sort
+ )
+
+ frames = [self] + other
+ non_empty_frames_idx = [
+ i for i, o in enumerate(frames) if o._partitions.size != 0
+ ]
+
+ # If all frames are empty
+ if len(non_empty_frames_idx) == 0:
+ return self._partitions, [o._partitions for o in other], joined_index
+
+ # Picking the first non-empty frame as the base
+ base_frame_idx = non_empty_frames_idx[0]
+ base_frame = frames[base_frame_idx]
+
+ other_frames = frames[base_frame_idx + 1 :]
+
+ base_index = base_frame.axes[axis]
+
+ # define conditions for reindexing and repartitioning the base frame
+ do_reindex_base = not base_index.equals(joined_index)
+ do_repartition_base = force_repartition or do_reindex_base
+
+ # perform repartitioning and reindexing for `base_frame` if needed
+ if do_repartition_base:
+ reindexed_base = base_frame._frame_mgr_cls.map_axis_partitions(
+ axis,
+ base_frame._partitions,
+ make_reindexer(do_reindex_base, base_frame_idx),
)
else:
- reindexed_self = self._partitions
- reindexed_other_list = []
+ reindexed_base = base_frame._partitions
- for i in range(len(other)):
- if right_old_idxes[i].equals(joined_index) and not force_repartition:
- reindexed_other = other[i]._partitions
- else:
- reindexed_other = other[i]._frame_mgr_cls.map_axis_partitions(
+ # define lengths of base and `other` frames for alignment purposes
+ base_lengths = get_axis_lengths(reindexed_base, axis)
+ others_lengths = [o._axes_lengths[axis] for o in other_frames]
+
+ # define conditions for reindexing and repartitioning `other` frames
+ do_reindex_others = [
+ not o.axes[axis].equals(joined_index) for o in other_frames
+ ]
+
+ do_repartition_others = [None] * len(other_frames)
+ for i in range(len(other_frames)):
+ do_repartition_others[i] = (
+ force_repartition
+ or do_reindex_others[i]
+ or others_lengths[i] != base_lengths
+ )
+
+ # perform repartitioning and reindexing for `other_frames` if needed
+ reindexed_other_list = [None] * len(other_frames)
+ for i in range(len(other_frames)):
+ if do_repartition_others[i]:
+ # indices of other frames start from `base_frame_idx` + 1
+ reindexed_other_list[i] = other_frames[
+ i
+ ]._frame_mgr_cls.map_axis_partitions(
axis,
- other[i]._partitions,
- lambda df: df.reindex(joined_index, axis=axis),
+ other_frames[i]._partitions,
+ make_reindexer(do_repartition_others[i], base_frame_idx + 1 + i),
+ lengths=base_lengths,
)
- reindexed_other_list.append(reindexed_other)
- return reindexed_self, reindexed_other_list, joined_index
+ else:
+ reindexed_other_list[i] = other_frames[i]._partitions
+ reindexed_frames = (
+ [frames[i]._partitions for i in range(base_frame_idx)]
+ + [reindexed_base]
+ + reindexed_other_list
+ )
+ return reindexed_frames[0], reindexed_frames[1:], joined_index
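A condensed sketch of the repartitioning decision made above, with invented partition lengths, showing when an `other` frame is left untouched:

# A frame is repartitioned when repartitioning is forced, its labels need
# reindexing, or its partition lengths disagree with the base frame's.
def needs_repartition(force, do_reindex, frame_lengths, base_lengths):
    return force or do_reindex or frame_lengths != base_lengths

base_lengths = [4, 4]
assert not needs_repartition(False, False, [4, 4], base_lengths)
assert needs_repartition(False, False, [3, 5], base_lengths)
assert needs_repartition(True, False, [4, 4], base_lengths)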
def _simple_shuffle(self, axis, other):
"""
@@ -1725,18 +1891,24 @@ def _binary_op(self, op, right_frame, join_type="outer"):
1, left_parts, lambda l, r: op(l, r), right_parts
)
new_columns = self.columns.join(right_frame.columns, how=join_type)
- return self.__constructor__(new_frame, self.index, new_columns, None, None)
+ return self.__constructor__(new_frame, joined_index, new_columns, None, None)
def _concat(self, axis, others, how, sort):
"""Concatenate this dataframe with one or more others.
- Args:
- axis: The axis to concatenate over.
- others: The list of dataframes to concatenate with.
- how: The type of join to use for the axis.
- sort: Whether or not to sort the result.
+ Parameters
+ ----------
+ axis: int
+ The axis to concatenate over.
+ others: List of dataframes
+ The list of dataframes to concatenate with.
+ how: str
+ The type of join to use for the axis.
+ sort: boolean
+ Whether or not to sort the result.
- Returns:
+ Returns
+ -------
A new dataframe.
"""
# Fast path for equivalent columns and partitioning
@@ -1766,7 +1938,7 @@ def _concat(self, axis, others, how, sort):
]
else:
left_parts, right_parts, joined_index = self._copartition(
- axis ^ 1, others, how, sort, force_repartition=True
+ axis ^ 1, others, how, sort, force_repartition=False
)
new_lengths = None
new_widths = None
@@ -1788,44 +1960,70 @@ def _concat(self, axis, others, how, sort):
)
def groupby_reduce(
- self, axis, by, map_func, reduce_func, new_index=None, new_columns=None
+ self,
+ axis,
+ by,
+ map_func,
+ reduce_func,
+ new_index=None,
+ new_columns=None,
+ apply_indices=None,
):
"""Groupby another dataframe and aggregate the result.
- Args:
- axis: The axis to groupby and aggregate over.
- by: The dataframe to group by.
- map_func: The map component of the aggregation.
- reduce_func: The reduce component of the aggregation.
- new_index: (optional) The index of the result. We may know this in advance,
+ Parameters
+ ----------
+ axis: int,
+ The axis to groupby and aggregate over.
+ by: ModinFrame (optional),
+ The dataframe to group by.
+ map_func: callable,
+ The map component of the aggregation.
+ reduce_func: callable,
+ The reduce component of the aggregation.
+ new_index: Index (optional),
+ The index of the result. We may know this in advance,
and if not provided it must be computed.
- new_columns: (optional) The columns of the result. We may know this in
- advance, and if not provided it must be computed.
+ new_columns: Index (optional),
+ The columns of the result. We may know this in advance,
+ and if not provided it must be computed.
+ apply_indices : list-like (optional),
+ Indices of `axis ^ 1` to apply groupby over.
- Returns:
+ Returns
+ -------
A new dataframe.
"""
+ by_parts = by if by is None else by._partitions
+
+ if apply_indices is not None:
+ numeric_indices = self.axes[axis ^ 1].get_indexer_for(apply_indices)
+ apply_indices = list(
+ self._get_dict_of_block_index(axis ^ 1, numeric_indices).keys()
+ )
+
new_partitions = self._frame_mgr_cls.groupby_reduce(
- axis, self._partitions, by._partitions, map_func, reduce_func
+ axis, self._partitions, by_parts, map_func, reduce_func, apply_indices
)
- if new_columns is None:
- new_columns = self._frame_mgr_cls.get_indices(
- 1, new_partitions, lambda df: df.columns
- )
- if new_index is None:
- new_index = self._frame_mgr_cls.get_indices(
- 0, new_partitions, lambda df: df.index
- )
- return self.__constructor__(new_partitions, new_index, new_columns)
+ new_axes = [
+ self._compute_axis_labels(i, new_partitions)
+ if new_axis is None
+ else new_axis
+ for i, new_axis in enumerate([new_index, new_columns])
+ ]
+
+ return self.__constructor__(new_partitions, *new_axes)
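The translation of `apply_indices` labels into positions relies on `Index.get_indexer_for`; a quick standalone example with made-up column labels:

import pandas

columns = pandas.Index(["a", "b", "c", "d"])
apply_indices = ["b", "d"]

# Label-based selection becomes positional, ready for block lookup.
numeric_indices = columns.get_indexer_for(apply_indices)
assert list(numeric_indices) == [1, 3]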
@classmethod
def from_pandas(cls, df):
"""Improve simple Pandas DataFrame to an advanced and superior Modin DataFrame.
- Args:
+ Parameters
+ ----------
df: Pandas DataFrame object.
- Returns:
+ Returns
+ -------
A new dataframe.
"""
new_index = df.index
@@ -1881,9 +2079,10 @@ def _arrow_type_to_dtype(cls, arrow_type):
return res
def to_pandas(self):
- """Converts Modin DataFrame to Pandas DataFrame.
+ """Convert a Modin DataFrame to Pandas DataFrame.
- Returns:
+ Returns
+ -------
Pandas DataFrame.
"""
df = self._frame_mgr_cls.to_pandas(self._partitions)
@@ -1905,7 +2104,7 @@ def to_pandas(self):
def to_numpy(self, **kwargs):
"""
- Converts Modin DataFrame to a 2D NumPy array.
+ Convert a Modin DataFrame to a 2D NumPy array.
Returns
-------
@@ -1916,7 +2115,8 @@ def to_numpy(self, **kwargs):
def transpose(self):
"""Transpose the index and columns of this dataframe.
- Returns:
+ Returns
+ -------
A new dataframe.
"""
new_partitions = self._frame_mgr_cls.lazy_map_partitions(
diff --git a/modin/engines/base/frame/partition.py b/modin/engines/base/frame/partition.py
index 8854b346e77..ec22573af37 100644
--- a/modin/engines/base/frame/partition.py
+++ b/modin/engines/base/frame/partition.py
@@ -11,24 +11,18 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
-NOT_IMPLEMENTED_MESSAGE = "Must be implemented in child class"
+from abc import ABC
-class BaseFramePartition(object): # pragma: no cover
- """This abstract class holds the data and metadata for a single partition.
- The methods required for implementing this abstract class are listed in
- the section immediately following this.
+class BaseFramePartition(ABC): # pragma: no cover
+ """An abstract class that holds the data and metadata for a single partition.
- The API exposed by the children of this object is used in
- `BaseFrameManager`.
+ The public API exposed by the children of this object is used in `BaseFrameManager`.
Note: These objects are treated as immutable by `BaseFrameManager`
subclasses. There is no logic for updating inplace.
"""
- # Abstract methods and fields. These must be implemented in order to
- # properly subclass this object. There are also some abstract classmethods
- # to implement.
def get(self):
"""Return the object wrapped by this one to the original format.
@@ -36,10 +30,11 @@ def get(self):
E.g. if you assign `x = BaseFramePartition.put(1)`, `x.get()` should
always return 1.
- Returns:
+ Returns
+ -------
The object that was `put`.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
def apply(self, func, **kwargs):
"""Apply some callable function to the data in this partition.
@@ -48,25 +43,38 @@ def apply(self, func, **kwargs):
an important part of many implementations. As of right now, they
are not serialized.
- Args:
- func: The lambda to apply (may already be correctly formatted)
+ Parameters
+ ----------
+ func : callable
+ The function to apply.
- Returns:
+ Returns
+ -------
A new `BaseFramePartition` containing the object that has had `func`
applied to it.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
def add_to_apply_calls(self, func, **kwargs):
"""Add the function to the apply function call stack.
- This function will be executed when apply is called. It will be executed
+ Note: This function will be executed when apply is called. It will be executed
in the order inserted; apply's func operates the last and return
+
+ Parameters
+ ----------
+ func : callable
+ The function to apply.
+
+ Returns
+ -------
+ A new `BaseFramePartition` with the function added to the call queue.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
def drain_call_queue(self):
"""Execute all functionality stored in the call queue."""
+ pass
def to_pandas(self):
"""Convert the object stored in this partition to a Pandas DataFrame.
@@ -74,10 +82,11 @@ def to_pandas(self):
Note: If the underlying object is a Pandas DataFrame, this will likely
only need to call `get`
- Returns:
+ Returns
+ -------
A Pandas DataFrame.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
def to_numpy(self, **kwargs):
"""Convert the object stored in this partition to a NumPy array.
@@ -85,10 +94,11 @@ def to_numpy(self, **kwargs):
Note: If the underlying object is a Pandas DataFrame, this will return
a 2D NumPy array.
- Returns:
+ Returns
+ -------
A NumPy array.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
def mask(self, row_indices, col_indices):
"""Lazily create a mask that extracts the indices provided.
@@ -97,22 +107,26 @@ def mask(self, row_indices, col_indices):
row_indices: The indices for the rows to extract.
col_indices: The indices for the columns to extract.
- Returns:
+ Returns
+ -------
A `BaseFramePartition` object.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
@classmethod
def put(cls, obj):
- """A factory classmethod to format a given object.
+ """Format a given object.
- Args:
- obj: An object.
+ Parameters
+ ----------
+ obj: object
+ An object.
- Returns:
+ Returns
+ -------
A `BaseFramePartition` object.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
@classmethod
def preprocess_func(cls, func):
@@ -123,36 +137,42 @@ def preprocess_func(cls, func):
deploy a preprocessed function to multiple `BaseFramePartition`
objects.
- Args:
- func: The function to preprocess.
+ Parameters
+ ----------
+ func : callable
+ The function to preprocess.
- Returns:
+ Returns
+ -------
An object that can be accepted by `apply`.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
@classmethod
def length_extraction_fn(cls):
- """The function to compute the length of the object in this partition.
+ """Compute the length of the object in this partition.
- Returns:
+ Returns
+ -------
A callable function.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
@classmethod
def width_extraction_fn(cls):
- """The function to compute the width of the object in this partition.
+ """Compute the width of the object in this partition.
- Returns:
+ Returns
+ -------
A callable function.
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
_length_cache = None
_width_cache = None
def length(self):
+ """Return the length of partition."""
if self._length_cache is None:
cls = type(self)
func = cls.length_extraction_fn()
@@ -161,6 +181,7 @@ def length(self):
return self._length_cache
def width(self):
+ """Return the width of partition."""
if self._width_cache is None:
cls = type(self)
func = cls.width_extraction_fn()
@@ -170,9 +191,10 @@ def width(self):
@classmethod
def empty(cls):
- """Create an empty partition
+ """Create an empty partition.
- Returns;
+ Returns
+ -------
An empty partition
"""
- raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
+ pass
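For orientation only, a standalone toy class that mirrors the shape of this interface by storing a pandas DataFrame eagerly; the name and behavior are illustrative, not Modin's actual Ray or Dask partition classes:

import pandas

class InMemoryPartition:
    """Toy partition that wraps a pandas DataFrame directly."""

    def __init__(self, data):
        self._data = data
        self._call_queue = []

    @classmethod
    def put(cls, obj):
        return cls(obj)

    def get(self):
        return self._data

    def apply(self, func, **kwargs):
        return InMemoryPartition(func(self._data, **kwargs))

    def add_to_apply_calls(self, func, **kwargs):
        new = InMemoryPartition(self._data)
        new._call_queue = self._call_queue + [(func, kwargs)]
        return new

    def drain_call_queue(self):
        for func, kwargs in self._call_queue:
            self._data = func(self._data, **kwargs)
        self._call_queue = []

    def to_pandas(self):
        return self._data

part = InMemoryPartition.put(pandas.DataFrame({"a": [1, 2, 3]}))
assert part.apply(lambda df: df * 2).to_pandas()["a"].tolist() == [2, 4, 6]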
diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py
index ff00a3e8559..068eaf022b1 100644
--- a/modin/engines/base/frame/partition_manager.py
+++ b/modin/engines/base/frame/partition_manager.py
@@ -11,17 +11,23 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
+from abc import ABC
import numpy as np
import pandas
from modin.error_message import ErrorMessage
from modin.data_management.utils import compute_chunksize
+from modin.config import NPartitions
+
from pandas.api.types import union_categoricals
-class BaseFrameManager(object):
- # Partition class is the class to use for storing each partition. It must
- # extend the `BaseFramePartition` class.
+class BaseFrameManager(ABC):
+ """Partition class is the class to use for storing each partition. It must extend the `BaseFramePartition` class.
+
+ It is the base class for managing the dataframe data layout and operators.
+ """
+
_partition_class = None
# Column partitions class is the class to use to create the column partitions.
_column_partitions_class = None
@@ -43,6 +49,7 @@ def preprocess_func(cls, map_func):
map_func: The function to be preprocessed.
Returns
+ -------
The preprocessed version of the `map_func` provided. Note: This
does not require any specific format, only that the
`BaseFramePartition.apply` method will recognize it (For the subclass
@@ -54,28 +61,39 @@ def preprocess_func(cls, map_func):
@classmethod
def column_partitions(cls, partitions):
- """A list of `BaseFrameAxisPartition` objects.
+ """List of `BaseFrameAxisPartition` objects.
Note: Each value in this list will be an `BaseFrameAxisPartition` object.
`BaseFrameAxisPartition` is located in `axis_partition.py`.
- Returns a list of `BaseFrameAxisPartition` objects.
+ Returns
+ -------
+ A list of `BaseFrameAxisPartition` objects.
"""
- return [cls._column_partitions_class(col) for col in partitions.T]
+ if not isinstance(partitions, list):
+ partitions = [partitions]
+ return [
+ cls._column_partitions_class(col) for frame in partitions for col in frame.T
+ ]
@classmethod
def row_partitions(cls, partitions):
- """A list of `BaseFrameAxisPartition` objects, represents column partitions.
+ """List of `BaseFrameAxisPartition` objects, represents column partitions.
Note: Each value in this list will an `BaseFrameAxisPartition` object.
`BaseFrameAxisPartition` is located in `axis_partition.py`.
- Returns a list of `BaseFrameAxisPartition` objects.
+ Returns
+ -------
+ A list of `BaseFrameAxisPartition` objects.
"""
- return [cls._row_partition_class(row) for row in partitions]
+ if not isinstance(partitions, list):
+ partitions = [partitions]
+ return [cls._row_partition_class(row) for frame in partitions for row in frame]
@classmethod
def axis_partition(cls, partitions, axis):
+ """Logically partition along either the columns or the rows."""
return (
cls.column_partitions(partitions)
if not axis
@@ -83,11 +101,45 @@ def axis_partition(cls, partitions, axis):
)
@classmethod
- def groupby_reduce(cls, axis, partitions, by, map_func, reduce_func):
- mapped_partitions = cls.broadcast_apply(
- axis, map_func, left=partitions, right=by, other_name="other"
+ def groupby_reduce(
+ cls, axis, partitions, by, map_func, reduce_func, apply_indices=None
+ ):
+ """
+ Group the data using `map_func` along the axis over the partitions, then reduce it using `reduce_func`.
+
+ Parameters
+ ----------
+ axis: int,
+ Axis to groupby over.
+ partitions: numpy 2D array,
+ Partitions of the ModinFrame to groupby.
+ by: numpy 2D array (optional),
+ Partitions of 'by' to broadcast.
+ map_func: callable,
+ Map function.
+ reduce_func: callable,
+ Reduce function.
+ apply_indices : list of ints (optional),
+ Indices of `axis ^ 1` to apply function over.
+
+ Returns
+ -------
+ Partitions with applied groupby.
+ """
+ if apply_indices is not None:
+ partitions = (
+ partitions[apply_indices] if axis else partitions[:, apply_indices]
+ )
+
+ if by is not None:
+ mapped_partitions = cls.broadcast_apply(
+ axis, map_func, left=partitions, right=by, other_name="other"
+ )
+ else:
+ mapped_partitions = cls.map_partitions(partitions, map_func)
+ return cls.map_axis_partitions(
+ axis, mapped_partitions, reduce_func, enumerate_partitions=True
)
- return cls.map_axis_partitions(axis, mapped_partitions, reduce_func)
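The map/reduce split performed here can be mimicked with plain pandas on hand-made chunks; a minimal sketch of a partition-wise count aggregation (data and column names invented):

import pandas

chunks = [
    pandas.DataFrame({"key": ["a", "b", "a"], "v": [1, 2, 3]}),
    pandas.DataFrame({"key": ["b", "b", "c"], "v": [4, 5, 6]}),
]

# Map: a cheap partial aggregation on every partition independently.
mapped = [c.groupby("key").count() for c in chunks]

# Reduce: concatenate the partial results and aggregate them again.
reduced = pandas.concat(mapped).groupby(level="key").sum()
assert reduced["v"].to_dict() == {"a": 2, "b": 3, "c": 1}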
@classmethod
def broadcast_apply_select_indices(
@@ -101,7 +153,7 @@ def broadcast_apply_select_indices(
keep_remaining=False,
):
"""
- Broadcast the right partitions to left and apply a function to selected indices
+ Broadcast the right partitions to left and apply a function to selected indices.
Note: Your internal function must take this kwargs:
[`internal_indices`, `other`, `internal_other_indices`] to work correctly
@@ -194,7 +246,7 @@ def broadcast_apply(cls, axis, apply_func, left, right, other_name="r"):
new_partitions = np.array(
[
[
- part.add_to_apply_calls(
+ part.apply(
apply_func,
**{other_name: right[col_idx] if axis else right[row_idx]},
)
@@ -214,30 +266,42 @@ def broadcast_axis_partitions(
left,
right,
keep_partitioning=False,
+ apply_indices=None,
+ enumerate_partitions=False,
+ lengths=None,
):
"""
Broadcast the right partitions to left and apply a function along full axis.
Parameters
----------
- axis : The axis to apply and broadcast over.
- apply_func : The function to apply.
- left : The left partitions.
- right : The right partitions.
- keep_partitioning : boolean. Default is False
- The flag to keep partitions for Modin Frame.
+ axis : The axis to apply and broadcast over.
+ apply_func : The function to apply.
+ left : The left partitions.
+ right : The right partitions.
+ keep_partitioning : boolean. Default is False
+ The flag to keep partitions for Modin Frame.
+ apply_indices : list of ints, optional
+ Indices of `axis ^ 1` to apply the function over.
+ enumerate_partitions : bool, default False
+ Whether or not to pass the partition index into `apply_func`.
+ Note that `apply_func` must be able to accept a `partition_idx` kwarg.
+ lengths : list(int), default None
+ The list of lengths to split the result into (manual partitioning).
Returns
-------
- A new `np.array` of partition objects.
+ A new `np.array` of partition objects.
"""
# Since we are already splitting the DataFrame back up after an
# operation, we will just use this time to compute the number of
# partitions as best we can right now.
if keep_partitioning:
num_splits = len(left) if axis == 0 else len(left.T)
+ elif lengths:
+ num_splits = len(lengths)
else:
- num_splits = cls._compute_num_partitions()
+ num_splits = NPartitions.get()
preprocessed_map_func = cls.preprocess_func(apply_func)
left_partitions = cls.axis_partition(left, axis)
right_partitions = None if right is None else cls.axis_partition(right, axis)
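The `enumerate_partitions` flag documented above forwards a `partition_idx` keyword to the applied function (see the `kw` construction below), so a compatible function looks roughly like this hypothetical sketch:

    def apply_func(axis_partition_df, partition_idx=None, **kwargs):
        # With enumerate_partitions=True the caller injects partition_idx,
        # letting the function know which axis partition it is processing
        # (e.g. to treat partition 0 differently from the rest).
        return axis_partition_df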
@@ -245,14 +309,25 @@ def broadcast_axis_partitions(
# may want to line the partitioning up with another BlockPartitions object. Since
# we don't need to maintain the partitioning, this gives us the opportunity to
# load-balance the data as well.
+ kw = {
+ "num_splits": num_splits,
+ "other_axis_partition": right_partitions,
+ }
+ if lengths:
+ kw["_lengths"] = lengths
+ kw["manual_partition"] = True
+
+ if apply_indices is None:
+ apply_indices = np.arange(len(left_partitions))
+
result_blocks = np.array(
[
- part.apply(
+ left_partitions[i].apply(
preprocessed_map_func,
- num_splits=num_splits,
- other_axis_partition=right_partitions,
+ **kw,
+ **({"partition_idx": idx} if enumerate_partitions else {}),
)
- for part in left_partitions
+ for idx, i in enumerate(apply_indices)
]
)
# If we are mapping over columns, they are returned to use the same as
@@ -262,12 +337,15 @@ def broadcast_axis_partitions(
@classmethod
def map_partitions(cls, partitions, map_func):
- """Applies `map_func` to every partition.
+ """Apply `map_func` to every partition.
- Args:
- map_func: The function to apply.
+ Parameters
+ ----------
+ map_func: callable
+ The function to apply.
- Returns:
+ Returns
+ -------
A new BaseFrameManager object, the type of object that called this.
"""
preprocessed_map_func = cls.preprocess_func(map_func)
@@ -280,6 +358,18 @@ def map_partitions(cls, partitions, map_func):
@classmethod
def lazy_map_partitions(cls, partitions, map_func):
+ """
+ Apply `map_func` to every partition lazily.
+
+ Parameters
+ ----------
+ map_func: callable
+ The function to apply.
+
+ Returns
+ -------
+ A new BaseFrameManager object, the type of object that called this.
+ """
preprocessed_map_func = cls.preprocess_func(map_func)
return np.array(
[
@@ -295,20 +385,27 @@ def map_axis_partitions(
partitions,
map_func,
keep_partitioning=False,
+ lengths=None,
+ enumerate_partitions=False,
):
"""
- Applies `map_func` to every partition.
+ Apply `map_func` to every partition.
Parameters
----------
- axis : 0 or 1
- The axis to perform the map across (0 - index, 1 - columns).
- partitions : NumPy array
- The partitions of Modin Frame.
- map_func : callable
- The function to apply.
- keep_partitioning : boolean. Default is False
- The flag to keep partitions for Modin Frame.
+ axis : 0 or 1
+ The axis to perform the map across (0 - index, 1 - columns).
+ partitions : NumPy array
+ The partitions of Modin Frame.
+ map_func : callable
+ The function to apply.
+ keep_partitioning : bool. Default is False
+ The flag to keep partitions for Modin Frame.
+ lengths : list(int), optional
+ The list of lengths to split the result into (manual partitioning).
+ enumerate_partitions : bool, default False
+ Whether or not to pass the partition index into `map_func`.
+ Note that `map_func` must be able to accept a `partition_idx` kwarg.
Returns
-------
@@ -326,12 +423,14 @@ def map_axis_partitions(
apply_func=map_func,
keep_partitioning=keep_partitioning,
right=None,
+ lengths=lengths,
+ enumerate_partitions=enumerate_partitions,
)
@classmethod
def simple_shuffle(cls, axis, partitions, map_func, lengths):
"""
- Shuffle data using `lengths` via `map_func`
+ Shuffle data using `lengths` via `map_func`.
Parameters
----------
@@ -381,7 +480,8 @@ def concat(cls, axis, left_parts, right_parts):
right_parts: the other blocks to be concatenated. This is a
BaseFrameManager object.
- Returns:
+ Returns
+ -------
A new BaseFrameManager object, the type of object that called this.
"""
if type(right_parts) is list:
@@ -389,14 +489,19 @@ def concat(cls, axis, left_parts, right_parts):
# but `np.concatenate` can concatenate arrays only if its shapes at
# specified axis are equals, so filtering empty frames to avoid concat error
right_parts = [o for o in right_parts if o.size != 0]
- return np.concatenate([left_parts] + right_parts, axis=axis)
+ to_concat = (
+ [left_parts] + right_parts if left_parts.size != 0 else right_parts
+ )
+ return (
+ np.concatenate(to_concat, axis=axis) if len(to_concat) else left_parts
+ )
else:
return np.append(left_parts, right_parts, axis=axis)
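The empty-frame filtering added above exists because `np.concatenate` refuses arrays whose shapes differ along the non-concatenation axis; a small self-contained illustration (shapes are made up):

    import numpy as np

    left = np.empty((0, 0), dtype=object)                 # empty block array
    right = [np.array([[1, 2], [3, 4]], dtype=object)]    # one 2x2 block array

    # np.concatenate([left] + right, axis=0) would raise ValueError because the
    # column counts (0 vs 2) differ, hence empty frames are dropped first.
    to_concat = [left] + right if left.size != 0 else right
    out = np.concatenate(to_concat, axis=0) if len(to_concat) else left
    print(out.shape)  # (2, 2)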
@classmethod
def concatenate(cls, dfs):
"""
- Concatenate Pandas DataFrames with saving 'category' dtype
+ Concatenate Pandas DataFrames while preserving the 'category' dtype.
Parameters
----------
@@ -421,7 +526,8 @@ def concatenate(cls, dfs):
def to_pandas(cls, partitions):
"""Convert this object into a Pandas DataFrame from the partitions.
- Returns:
+ Returns
+ -------
A Pandas DataFrame
"""
retrieved_objects = [[obj.to_pandas() for obj in part] for part in partitions]
@@ -462,7 +568,8 @@ def to_numpy(cls, partitions, **kwargs):
@classmethod
def from_pandas(cls, df, return_dims=False):
- num_splits = cls._compute_num_partitions()
+ """Return the partitions from Pandas DataFrame."""
+ num_splits = NPartitions.get()
put_func = cls._partition_class.put
row_chunksize, col_chunksize = compute_chunksize(df, num_splits)
parts = [
@@ -491,11 +598,12 @@ def from_pandas(cls, df, return_dims=False):
@classmethod
def from_arrow(cls, at, return_dims=False):
+ """Return the partitions from Apache Arrow (PyArrow)."""
return cls.from_pandas(at.to_pandas(), return_dims=return_dims)
@classmethod
def get_indices(cls, axis, partitions, index_func=None):
- """This gets the internal indices stored in the partitions.
+ """Get the internal indices stored in the partitions.
Note: These are the global indices of the object. This is mostly useful
when you have deleted rows/columns internally, but do not know
@@ -505,7 +613,8 @@ def get_indices(cls, axis, partitions, index_func=None):
axis: The axis from which to extract the labels (0 - index, 1 - columns).
index_func: The function to be used to extract the indices.
- Returns:
+ Returns
+ -------
A Pandas Index object.
"""
ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
@@ -525,32 +634,19 @@ def get_indices(cls, axis, partitions, index_func=None):
# TODO FIX INFORMATION LEAK!!!!1!!1!!
return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
- @classmethod
- def _compute_num_partitions(cls):
- """Currently, this method returns the default. In the future it will
- estimate the optimal number of partitions.
-
- :return:
- """
- from modin.pandas import DEFAULT_NPARTITIONS
-
- return DEFAULT_NPARTITIONS
-
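For context on the recurring `DEFAULT_NPARTITIONS` -> `NPartitions.get()` substitution in this patch: the partition count now lives in `modin.config`, so (as I read it) it can be inspected or overridden like any other config parameter:

    from modin.config import NPartitions

    print(NPartitions.get())  # typically defaults to the number of CPUs
    NPartitions.put(16)       # override the partition count for new frames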
@classmethod
def _apply_func_to_list_of_partitions_broadcast(
cls, func, partitions, other, **kwargs
):
preprocessed_func = cls.preprocess_func(func)
return [
- obj.add_to_apply_calls(
- preprocessed_func, other=[o.get() for o in broadcasted], **kwargs
- )
+ obj.apply(preprocessed_func, other=[o.get() for o in broadcasted], **kwargs)
for obj, broadcasted in zip(partitions, other.T)
]
@classmethod
def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs):
- """Applies a function to a list of remote partitions.
+ """Apply a function to a list of remote partitions.
Note: The main use for this is to preprocess the func.
@@ -558,19 +654,18 @@ def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs):
func: The func to apply
partitions: The list of partitions
- Returns:
+ Returns
+ -------
A list of BaseFramePartition objects.
"""
preprocessed_func = cls.preprocess_func(func)
- return [
- obj.add_to_apply_calls(preprocessed_func, **kwargs) for obj in partitions
- ]
+ return [obj.apply(preprocessed_func, **kwargs) for obj in partitions]
@classmethod
def apply_func_to_select_indices(
cls, axis, partitions, func, indices, keep_remaining=False
):
- """Applies a function to select indices.
+ """Apply a function to select indices.
Note: Your internal function must take a kwarg `internal_indices` for
this to work correctly. This prevents information leakage of the
@@ -584,7 +679,8 @@ def apply_func_to_select_indices(
Some operations may want to drop the remaining partitions and
keep only the results.
- Returns:
+ Returns
+ -------
A new BaseFrameManager object, the type of object that called this.
"""
if partitions.size == 0:
@@ -671,7 +767,7 @@ def apply_func_to_select_indices(
def apply_func_to_select_indices_along_full_axis(
cls, axis, partitions, func, indices, keep_remaining=False
):
- """Applies a function to a select subset of full columns/rows.
+ """Apply a function to a select subset of full columns/rows.
Note: This should be used when you need to apply a function that relies
on some global information for the entire column/row, but only need
@@ -680,15 +776,21 @@ def apply_func_to_select_indices_along_full_axis(
Important: For your func to operate directly on the indices provided,
it must use `internal_indices` as a keyword argument.
- Args:
- axis: The axis to apply the function over (0 - rows, 1 - columns)
- func: The function to apply.
- indices: The global indices to apply the func to.
- keep_remaining: Whether or not to keep the other partitions.
- Some operations may want to drop the remaining partitions and
- keep only the results.
+ Parameters
+ ----------
+ axis: int
+ The axis to apply the function over (0 - rows, 1 - columns)
+ func: callable
+ The function to apply.
+ indices: list-like
+ The global indices to apply the func to.
+ keep_remaining: boolean
+ Whether or not to keep the other partitions.
+ Some operations may want to drop the remaining partitions and
+ keep only the results.
- Returns:
+ Returns
+ -------
A new BaseFrameManager object, the type of object that called this.
"""
if partitions.size == 0:
@@ -780,7 +882,7 @@ def apply_func_to_indices_both_axis(
item_to_distribute=None,
):
"""
- Apply a function to along both axis
+ Apply a function to along both axis.
Important: For your func to operate directly on the indices provided,
it must use `row_internal_indices, col_internal_indices` as keyword
@@ -848,7 +950,7 @@ def binary_operation(cls, axis, left, func, right):
[
left_partitions[i].apply(
func,
- num_splits=cls._compute_num_partitions(),
+ num_splits=NPartitions.get(),
other_axis_partition=right_partitions[i],
)
for i in range(len(left_partitions))
diff --git a/modin/engines/base/io/__init__.py b/modin/engines/base/io/__init__.py
index e01ab740ff3..a8d9bd840ad 100644
--- a/modin/engines/base/io/__init__.py
+++ b/modin/engines/base/io/__init__.py
@@ -12,27 +12,29 @@
# governing permissions and limitations under the License.
from modin.engines.base.io.io import BaseIO
-from modin.engines.base.io.text.csv_reader import CSVReader
-from modin.engines.base.io.text.fwf_reader import FWFReader
-from modin.engines.base.io.text.json_reader import JSONReader
-from modin.engines.base.io.text.excel_reader import ExcelReader
-from modin.engines.base.io.file_reader import FileReader
-from modin.engines.base.io.text.text_file_reader import TextFileReader
-from modin.engines.base.io.column_stores.parquet_reader import ParquetReader
-from modin.engines.base.io.column_stores.hdf_reader import HDFReader
-from modin.engines.base.io.column_stores.feather_reader import FeatherReader
-from modin.engines.base.io.sql.sql_reader import SQLReader
+from modin.engines.base.io.text.csv_dispatcher import CSVDispatcher
+from modin.engines.base.io.text.csv_glob_dispatcher import CSVGlobDispatcher
+from modin.engines.base.io.text.fwf_dispatcher import FWFDispatcher
+from modin.engines.base.io.text.json_dispatcher import JSONDispatcher
+from modin.engines.base.io.text.excel_dispatcher import ExcelDispatcher
+from modin.engines.base.io.file_dispatcher import FileDispatcher
+from modin.engines.base.io.text.text_file_dispatcher import TextFileDispatcher
+from modin.engines.base.io.column_stores.parquet_dispatcher import ParquetDispatcher
+from modin.engines.base.io.column_stores.hdf_dispatcher import HDFDispatcher
+from modin.engines.base.io.column_stores.feather_dispatcher import FeatherDispatcher
+from modin.engines.base.io.sql.sql_dispatcher import SQLDispatcher
__all__ = [
"BaseIO",
- "CSVReader",
- "FWFReader",
- "JSONReader",
- "FileReader",
- "TextFileReader",
- "ParquetReader",
- "HDFReader",
- "FeatherReader",
- "SQLReader",
- "ExcelReader",
+ "CSVDispatcher",
+ "CSVGlobDispatcher",
+ "FWFDispatcher",
+ "JSONDispatcher",
+ "FileDispatcher",
+ "TextFileDispatcher",
+ "ParquetDispatcher",
+ "HDFDispatcher",
+ "FeatherDispatcher",
+ "SQLDispatcher",
+ "ExcelDispatcher",
]
diff --git a/modin/engines/base/io/column_stores/column_store_reader.py b/modin/engines/base/io/column_stores/column_store_dispatcher.py
similarity index 81%
rename from modin/engines/base/io/column_stores/column_store_reader.py
rename to modin/engines/base/io/column_stores/column_store_dispatcher.py
index e13cf8619f4..e166ea82ad5 100644
--- a/modin/engines/base/io/column_stores/column_store_reader.py
+++ b/modin/engines/base/io/column_stores/column_store_dispatcher.py
@@ -15,23 +15,22 @@
import pandas
from modin.data_management.utils import compute_chunksize
-from modin.engines.base.io.file_reader import FileReader
+from modin.engines.base.io.file_dispatcher import FileDispatcher
+from modin.config import NPartitions
-class ColumnStoreReader(FileReader):
+class ColumnStoreDispatcher(FileDispatcher):
@classmethod
def call_deploy(cls, fname, col_partitions, **kwargs):
- from modin.pandas import DEFAULT_NPARTITIONS
-
return np.array(
[
cls.deploy(
cls.parse,
- DEFAULT_NPARTITIONS + 2,
+ NPartitions.get() + 2,
dict(
fname=fname,
columns=cols,
- num_splits=DEFAULT_NPARTITIONS,
+ num_splits=NPartitions.get(),
**kwargs,
),
)
@@ -57,8 +56,7 @@ def build_partition(cls, partition_ids, row_lengths, column_widths):
@classmethod
def build_index(cls, partition_ids):
- from modin.pandas import DEFAULT_NPARTITIONS
-
+ num_partitions = NPartitions.get()
index_len = cls.materialize(partition_ids[-2][0])
if isinstance(index_len, int):
index = pandas.RangeIndex(index_len)
@@ -66,27 +64,26 @@ def build_index(cls, partition_ids):
index = index_len
index_len = len(index)
index_chunksize = compute_chunksize(
- pandas.DataFrame(index=index), DEFAULT_NPARTITIONS, axis=0
+ pandas.DataFrame(index=index), num_partitions, axis=0
)
if index_chunksize > index_len:
- row_lengths = [index_len] + [0 for _ in range(DEFAULT_NPARTITIONS - 1)]
+ row_lengths = [index_len] + [0 for _ in range(num_partitions - 1)]
else:
row_lengths = [
index_chunksize
- if i != DEFAULT_NPARTITIONS - 1
- else index_len - (index_chunksize * (DEFAULT_NPARTITIONS - 1))
- for i in range(DEFAULT_NPARTITIONS)
+ if i != num_partitions - 1
+ else index_len - (index_chunksize * (num_partitions - 1))
+ for i in range(num_partitions)
]
return index, row_lengths
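A small worked example of the `row_lengths` arithmetic above; the chunk size value is assumed purely for illustration:

    index_len = 10
    num_partitions = 4
    index_chunksize = 3  # assume compute_chunksize(...) returned 3

    if index_chunksize > index_len:
        row_lengths = [index_len] + [0] * (num_partitions - 1)
    else:
        row_lengths = [
            index_chunksize
            if i != num_partitions - 1
            else index_len - (index_chunksize * (num_partitions - 1))
            for i in range(num_partitions)
        ]
    print(row_lengths)  # [3, 3, 3, 1]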
@classmethod
def build_columns(cls, columns):
- from modin.pandas import DEFAULT_NPARTITIONS
-
+ num_partitions = NPartitions.get()
column_splits = (
- len(columns) // DEFAULT_NPARTITIONS
- if len(columns) % DEFAULT_NPARTITIONS == 0
- else len(columns) // DEFAULT_NPARTITIONS + 1
+ len(columns) // num_partitions
+ if len(columns) % num_partitions == 0
+ else len(columns) // num_partitions + 1
)
col_partitions = [
columns[i : i + column_splits]
diff --git a/modin/engines/base/io/column_stores/feather_reader.py b/modin/engines/base/io/column_stores/feather_dispatcher.py
similarity index 80%
rename from modin/engines/base/io/column_stores/feather_reader.py
rename to modin/engines/base/io/column_stores/feather_dispatcher.py
index 7b311b40f7b..be131dbf981 100644
--- a/modin/engines/base/io/column_stores/feather_reader.py
+++ b/modin/engines/base/io/column_stores/feather_dispatcher.py
@@ -11,10 +11,12 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
-from modin.engines.base.io.column_stores.column_store_reader import ColumnStoreReader
+from modin.engines.base.io.column_stores.column_store_dispatcher import (
+ ColumnStoreDispatcher,
+)
-class FeatherReader(ColumnStoreReader):
+class FeatherDispatcher(ColumnStoreDispatcher):
@classmethod
def _read(cls, path, columns=None, **kwargs):
"""Read data from the file path, returning a Modin DataFrame.
@@ -32,8 +34,7 @@ def _read(cls, path, columns=None, **kwargs):
https://arrow.apache.org/docs/python/api.html#feather-format
"""
if columns is None:
- from pyarrow.feather import FeatherReader
+ from pyarrow.feather import read_feather
- fr = FeatherReader(path)
- columns = [fr.get_column_name(i) for i in range(fr.num_columns)]
- return cls.build_query_compiler(path, columns, use_threads=False)
+ df = read_feather(path)
+ return cls.build_query_compiler(path, df.columns, use_threads=False)
diff --git a/modin/engines/base/io/column_stores/hdf_reader.py b/modin/engines/base/io/column_stores/hdf_dispatcher.py
similarity index 94%
rename from modin/engines/base/io/column_stores/hdf_reader.py
rename to modin/engines/base/io/column_stores/hdf_dispatcher.py
index 70f99adf5c3..2c4f2d773f4 100644
--- a/modin/engines/base/io/column_stores/hdf_reader.py
+++ b/modin/engines/base/io/column_stores/hdf_dispatcher.py
@@ -13,11 +13,13 @@
import pandas
-from modin.engines.base.io.column_stores.column_store_reader import ColumnStoreReader
+from modin.engines.base.io.column_stores.column_store_dispatcher import (
+ ColumnStoreDispatcher,
+)
from modin.error_message import ErrorMessage
-class HDFReader(ColumnStoreReader): # pragma: no cover
+class HDFDispatcher(ColumnStoreDispatcher): # pragma: no cover
@classmethod
def _validate_hdf_format(cls, path_or_buf):
s = pandas.HDFStore(path_or_buf)
diff --git a/modin/engines/base/io/column_stores/parquet_reader.py b/modin/engines/base/io/column_stores/parquet_dispatcher.py
similarity index 81%
rename from modin/engines/base/io/column_stores/parquet_reader.py
rename to modin/engines/base/io/column_stores/parquet_dispatcher.py
index 7c8ae96d565..7f9b1cef4bc 100644
--- a/modin/engines/base/io/column_stores/parquet_reader.py
+++ b/modin/engines/base/io/column_stores/parquet_dispatcher.py
@@ -13,11 +13,13 @@
import os
-from modin.engines.base.io.column_stores.column_store_reader import ColumnStoreReader
+from modin.engines.base.io.column_stores.column_store_dispatcher import (
+ ColumnStoreDispatcher,
+)
from modin.error_message import ErrorMessage
-class ParquetReader(ColumnStoreReader):
+class ParquetDispatcher(ColumnStoreDispatcher):
@classmethod
def _read(cls, path, engine, columns, **kwargs):
"""Load a parquet object from the file path, returning a Modin DataFrame.
@@ -48,7 +50,7 @@ def _read(cls, path, engine, columns, **kwargs):
from pyarrow.parquet import ParquetFile, ParquetDataset
from modin.pandas.io import PQ_INDEX_REGEX
- if os.path.isdir(path):
+ if isinstance(path, str) and os.path.isdir(path):
partitioned_columns = set()
directory = True
# We do a tree walk of the path directory because partitioned
@@ -72,6 +74,8 @@ def _read(cls, path, engine, columns, **kwargs):
else:
directory = False
if not columns:
+ import s3fs
+
if directory:
# Path of the sample file that we will read to get the remaining columns
pd = ParquetDataset(path)
@@ -84,6 +88,22 @@ def _read(cls, path, engine, columns, **kwargs):
pd = ParquetDataset(path, filesystem=fs)
meta = pd.metadata
column_names = pd.schema.names
+ elif isinstance(path, s3fs.S3File) or (
+ isinstance(path, str) and path.startswith("s3://")
+ ):
+ from botocore.exceptions import NoCredentialsError
+
+ if isinstance(path, s3fs.S3File):
+ bucket_path = path.url().split(".s3.amazonaws.com")
+ path = "s3://" + bucket_path[0].split("://")[1] + bucket_path[1]
+ try:
+ fs = s3fs.S3FileSystem()
+ pd = ParquetDataset(path, filesystem=fs)
+ except NoCredentialsError:
+ fs = s3fs.S3FileSystem(anon=True)
+ pd = ParquetDataset(path, filesystem=fs)
+ meta = pd.metadata
+ column_names = pd.schema.names
else:
meta = ParquetFile(path).metadata
column_names = meta.schema.names
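The credentials-then-anonymous fallback added here is a common s3fs pattern; a standalone sketch with a hypothetical bucket path:

    import s3fs
    from botocore.exceptions import NoCredentialsError
    from pyarrow.parquet import ParquetDataset

    path = "s3://some-public-bucket/data.parquet"  # hypothetical location
    try:
        fs = s3fs.S3FileSystem()                   # use configured credentials
        dataset = ParquetDataset(path, filesystem=fs)
    except NoCredentialsError:
        fs = s3fs.S3FileSystem(anon=True)          # fall back to anonymous access
        dataset = ParquetDataset(path, filesystem=fs)
    print(dataset.schema.names)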
diff --git a/modin/engines/base/io/file_reader.py b/modin/engines/base/io/file_dispatcher.py
similarity index 97%
rename from modin/engines/base/io/file_reader.py
rename to modin/engines/base/io/file_dispatcher.py
index 8a8ea6bd1ef..15fab8e48d1 100644
--- a/modin/engines/base/io/file_reader.py
+++ b/modin/engines/base/io/file_dispatcher.py
@@ -19,7 +19,7 @@
NOT_IMPLEMENTED_MESSAGE = "Implement in children classes!"
-class FileReader:
+class FileDispatcher:
frame_cls = None
frame_partition_cls = None
query_compiler_cls = None
@@ -137,10 +137,10 @@ def file_exists(cls, file_path):
return os.path.exists(file_path)
@classmethod
- def deploy(cls, func, args, num_return_vals):
+ def deploy(cls, func, args, num_returns):
raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
- def parse(self, func, args, num_return_vals):
+ def parse(self, func, args, num_returns):
raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
@classmethod
diff --git a/modin/engines/base/io/io.py b/modin/engines/base/io/io.py
index 5f28d2a635d..6b4f06ee58e 100644
--- a/modin/engines/base/io/io.py
+++ b/modin/engines/base/io/io.py
@@ -35,7 +35,7 @@ def from_arrow(cls, at):
return cls.query_compiler_cls.from_arrow(at, cls.frame_cls)
@classmethod
- def read_parquet(cls, path, engine, columns, **kwargs):
+ def read_parquet(cls, path, engine, columns, use_nullable_dtypes, **kwargs):
"""Load a parquet object from the file path, returning a Modin DataFrame.
Modin only supports pyarrow engine for now.
@@ -51,7 +51,9 @@ def read_parquet(cls, path, engine, columns, **kwargs):
https://arrow.apache.org/docs/python/parquet.html
"""
ErrorMessage.default_to_pandas("`read_parquet`")
- return cls.from_pandas(pandas.read_parquet(path, engine, columns, **kwargs))
+ return cls.from_pandas(
+ pandas.read_parquet(path, engine, columns, use_nullable_dtypes, **kwargs)
+ )
@classmethod
def read_csv(
@@ -105,6 +107,7 @@ def read_csv(
low_memory=True,
memory_map=False,
float_precision=None,
+ storage_options=None,
):
kwargs = {
"filepath_or_buffer": filepath_or_buffer,
@@ -156,6 +159,7 @@ def read_csv(
"low_memory": low_memory,
"memory_map": memory_map,
"float_precision": float_precision,
+ "storage_options": storage_options,
}
ErrorMessage.default_to_pandas("`read_csv`")
return cls._read(**kwargs)
@@ -199,6 +203,7 @@ def read_json(
chunksize=None,
compression="infer",
nrows: Optional[int] = None,
+ storage_options=None,
):
ErrorMessage.default_to_pandas("`read_json`")
kwargs = {
@@ -217,6 +222,7 @@ def read_json(
"chunksize": chunksize,
"compression": compression,
"nrows": nrows,
+ "storage_options": storage_options,
}
return cls.from_pandas(pandas.read_json(**kwargs))
@@ -407,10 +413,15 @@ def read_hdf(
)
@classmethod
- def read_feather(cls, path, columns=None, use_threads=True):
+ def read_feather(cls, path, columns=None, use_threads=True, storage_options=None):
ErrorMessage.default_to_pandas("`read_feather`")
return cls.from_pandas(
- pandas.read_feather(path, columns=columns, use_threads=use_threads)
+ pandas.read_feather(
+ path,
+ columns=columns,
+ use_threads=use_threads,
+ storage_options=storage_options,
+ )
)
@classmethod
@@ -426,6 +437,7 @@ def read_stata(
order_categoricals=True,
chunksize=None,
iterator=False,
+ storage_options=None,
):
ErrorMessage.default_to_pandas("`read_stata`")
kwargs = {
@@ -439,6 +451,7 @@ def read_stata(
"order_categoricals": order_categoricals,
"chunksize": chunksize,
"iterator": iterator,
+ "storage_options": storage_options,
}
return cls.from_pandas(pandas.read_stata(**kwargs))
@@ -465,10 +478,14 @@ def read_sas(
)
@classmethod
- def read_pickle(cls, filepath_or_buffer, compression="infer"):
+ def read_pickle(cls, filepath_or_buffer, compression="infer", storage_options=None):
ErrorMessage.default_to_pandas("`read_pickle`")
return cls.from_pandas(
- pandas.read_pickle(filepath_or_buffer, compression=compression)
+ pandas.read_pickle(
+ filepath_or_buffer,
+ compression=compression,
+ storage_options=storage_options,
+ )
)
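The `storage_options` dictionary threaded through these readers is handed straight to pandas/fsspec; for example, reading a public S3 object anonymously might look like this (URL hypothetical):

    import modin.pandas as pd

    df = pd.read_csv(
        "s3://some-public-bucket/file.csv",   # hypothetical location
        storage_options={"anon": True},       # forwarded to fsspec/s3fs
    )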
@classmethod
diff --git a/modin/engines/base/io/sql/sql_reader.py b/modin/engines/base/io/sql/sql_dispatcher.py
similarity index 95%
rename from modin/engines/base/io/sql/sql_reader.py
rename to modin/engines/base/io/sql/sql_dispatcher.py
index 5a2ae552af3..67eb5c78638 100644
--- a/modin/engines/base/io/sql/sql_reader.py
+++ b/modin/engines/base/io/sql/sql_dispatcher.py
@@ -16,10 +16,11 @@
import pandas
import warnings
-from modin.engines.base.io.file_reader import FileReader
+from modin.engines.base.io.file_dispatcher import FileDispatcher
+from modin.config import NPartitions
-class SQLReader(FileReader):
+class SQLDispatcher(FileDispatcher):
@classmethod
def _read(cls, sql, con, index_col=None, **kwargs):
"""Reads a SQL query or database table into a DataFrame.
@@ -62,9 +63,7 @@ def _read(cls, sql, con, index_col=None, **kwargs):
"SELECT * FROM ({}) as foo LIMIT 0".format(sql), con, index_col=index_col
)
cols_names = cols_names_df.columns
- from modin.pandas import DEFAULT_NPARTITIONS
-
- num_partitions = DEFAULT_NPARTITIONS
+ num_partitions = NPartitions.get()
partition_ids = []
index_ids = []
dtype_ids = []
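The `LIMIT 0` query shown in the context above is just a cheap way to discover the result schema before partitioning the real query; the same trick in plain pandas:

    import sqlite3
    import pandas

    con = sqlite3.connect(":memory:")
    con.execute("CREATE TABLE t (a INTEGER, b TEXT)")

    # Fetch zero rows but full column metadata.
    cols = pandas.read_sql("SELECT * FROM (SELECT * FROM t) as foo LIMIT 0", con).columns
    print(list(cols))  # ['a', 'b']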
diff --git a/modin/engines/base/io/text/csv_reader.py b/modin/engines/base/io/text/csv_dispatcher.py
similarity index 93%
rename from modin/engines/base/io/text/csv_reader.py
rename to modin/engines/base/io/text/csv_dispatcher.py
index 6243ef592e3..3c6d161d8b4 100644
--- a/modin/engines/base/io/text/csv_reader.py
+++ b/modin/engines/base/io/text/csv_dispatcher.py
@@ -11,17 +11,20 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
-from modin.engines.base.io.text.text_file_reader import TextFileReader
+from modin.engines.base.io.text.text_file_dispatcher import TextFileDispatcher
from modin.data_management.utils import compute_chunksize
from pandas.io.parsers import _validate_usecols_arg
import pandas
import csv
import sys
+from modin.config import NPartitions
-class CSVReader(TextFileReader):
+
+class CSVDispatcher(TextFileDispatcher):
@classmethod
def _read(cls, filepath_or_buffer, **kwargs):
+ filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
if isinstance(filepath_or_buffer, str):
if not cls.file_exists(filepath_or_buffer):
return cls.single_worker_read(filepath_or_buffer, **kwargs)
@@ -59,6 +62,7 @@ def _read(cls, filepath_or_buffer, **kwargs):
names = kwargs.get("names", None)
index_col = kwargs.get("index_col", None)
usecols = kwargs.get("usecols", None)
+ encoding = kwargs.get("encoding", None)
if names is None:
# For the sake of the empty df, we assume no `index_col` to get the correct
# column names before we build the index. Because we pass `names` in, this
@@ -71,7 +75,9 @@ def _read(cls, filepath_or_buffer, **kwargs):
elif index_col is None and not usecols:
# When names is set to some list that is smaller than the number of columns
# in the file, the first columns are built as a hierarchical index.
- empty_pd_df = pandas.read_csv(filepath_or_buffer, nrows=0)
+ empty_pd_df = pandas.read_csv(
+ filepath_or_buffer, nrows=0, encoding=encoding
+ )
num_cols = len(empty_pd_df.columns)
if num_cols > len(names):
index_col = list(range(num_cols - len(names)))
@@ -120,12 +126,6 @@ def _read(cls, filepath_or_buffer, **kwargs):
skiprows += header + 1
elif hasattr(header, "__iter__") and not isinstance(header, str):
skiprows += max(header) + 1
- cls.offset(
- f,
- nrows=skiprows,
- quotechar=quotechar,
- is_quoting=is_quoting,
- )
if kwargs.get("encoding", None) is not None:
partition_kwargs["skiprows"] = 1
# Launch tasks to read partitions
@@ -133,9 +133,7 @@ def _read(cls, filepath_or_buffer, **kwargs):
index_ids = []
dtypes_ids = []
# Max number of partitions available
- from modin.pandas import DEFAULT_NPARTITIONS
-
- num_partitions = DEFAULT_NPARTITIONS
+ num_partitions = NPartitions.get()
# This is the number of splits for the columns
num_splits = min(len(column_names), num_partitions)
# Metadata
@@ -163,8 +161,9 @@ def _read(cls, filepath_or_buffer, **kwargs):
splits = cls.partitioned_file(
f,
- nrows=nrows,
num_partitions=num_partitions,
+ nrows=nrows,
+ skiprows=skiprows,
quotechar=quotechar,
is_quoting=is_quoting,
)
@@ -180,11 +179,6 @@ def _read(cls, filepath_or_buffer, **kwargs):
if index_col is None:
row_lengths = cls.materialize(index_ids)
new_index = pandas.RangeIndex(sum(row_lengths))
- # pandas has a really weird edge case here.
- if kwargs.get("names", None) is not None and skiprows > 1:
- new_index = pandas.RangeIndex(
- skiprows - 1, new_index.stop + skiprows - 1
- )
else:
index_objs = cls.materialize(index_ids)
row_lengths = [len(o) for o in index_objs]
diff --git a/modin/engines/base/io/text/csv_glob_dispatcher.py b/modin/engines/base/io/text/csv_glob_dispatcher.py
new file mode 100644
index 00000000000..eb49c68670b
--- /dev/null
+++ b/modin/engines/base/io/text/csv_glob_dispatcher.py
@@ -0,0 +1,467 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+from contextlib import ExitStack
+import csv
+import glob
+import os
+import sys
+from typing import List, Tuple
+import warnings
+
+import pandas
+from pandas.io.parsers import _validate_usecols_arg
+
+from modin.config import NPartitions
+from modin.data_management.utils import compute_chunksize
+from modin.engines.base.io.file_dispatcher import S3_ADDRESS_REGEX
+from modin.engines.base.io.text.csv_dispatcher import CSVDispatcher
+
+
+class CSVGlobDispatcher(CSVDispatcher):
+ @classmethod
+ def _read(cls, filepath_or_buffer, **kwargs):
+ # Ensures that the file is a string file path. Otherwise, default to pandas.
+ filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
+ if isinstance(filepath_or_buffer, str):
+ if not cls.file_exists(filepath_or_buffer):
+ return cls.single_worker_read(filepath_or_buffer, **kwargs)
+ filepath_or_buffer = cls.get_path(filepath_or_buffer)
+ elif not cls.pathlib_or_pypath(filepath_or_buffer):
+ return cls.single_worker_read(filepath_or_buffer, **kwargs)
+
+ # We read multiple csv files when the file path is a list of absolute file paths. We assume that all of the files will be essentially replicas of the
+ # first file but with different data values.
+ glob_filepaths = filepath_or_buffer
+ filepath_or_buffer = filepath_or_buffer[0]
+
+ compression_type = cls.infer_compression(
+ filepath_or_buffer, kwargs.get("compression")
+ )
+ if compression_type is not None:
+ if (
+ compression_type == "gzip"
+ or compression_type == "bz2"
+ or compression_type == "xz"
+ ):
+ kwargs["compression"] = compression_type
+ elif (
+ compression_type == "zip"
+ and sys.version_info[0] == 3
+ and sys.version_info[1] >= 7
+ ):
+ # need python3.7 to be able to .seek and .tell a ZipExtFile
+ kwargs["compression"] = compression_type
+ else:
+ return cls.single_worker_read(filepath_or_buffer, **kwargs)
+
+ chunksize = kwargs.get("chunksize")
+ if chunksize is not None:
+ return cls.single_worker_read(filepath_or_buffer, **kwargs)
+
+ skiprows = kwargs.get("skiprows")
+ if skiprows is not None and not isinstance(skiprows, int):
+ return cls.single_worker_read(filepath_or_buffer, **kwargs)
+
+ nrows = kwargs.pop("nrows", None)
+ names = kwargs.get("names", None)
+ index_col = kwargs.get("index_col", None)
+ usecols = kwargs.get("usecols", None)
+ encoding = kwargs.get("encoding", None)
+ if names is None:
+ # For the sake of the empty df, we assume no `index_col` to get the correct
+ # column names before we build the index. Because we pass `names` in, this
+ # step has to happen without removing the `index_col` otherwise it will not
+ # be assigned correctly.
+ names = pandas.read_csv(
+ filepath_or_buffer,
+ **dict(kwargs, usecols=None, nrows=0, skipfooter=0, index_col=None),
+ ).columns
+ elif index_col is None and not usecols:
+ # When names is set to some list that is smaller than the number of columns
+ # in the file, the first columns are built as a hierarchical index.
+ empty_pd_df = pandas.read_csv(
+ filepath_or_buffer, nrows=0, encoding=encoding
+ )
+ num_cols = len(empty_pd_df.columns)
+ if num_cols > len(names):
+ index_col = list(range(num_cols - len(names)))
+ if len(index_col) == 1:
+ index_col = index_col[0]
+ kwargs["index_col"] = index_col
+ empty_pd_df = pandas.read_csv(
+ filepath_or_buffer, **dict(kwargs, nrows=0, skipfooter=0)
+ )
+ column_names = empty_pd_df.columns
+ skipfooter = kwargs.get("skipfooter", None)
+ skiprows = kwargs.pop("skiprows", None)
+ usecols_md = _validate_usecols_arg(usecols)
+ if usecols is not None and usecols_md[1] != "integer":
+ del kwargs["usecols"]
+ all_cols = pandas.read_csv(
+ cls.file_open(filepath_or_buffer, "rb"),
+ **dict(kwargs, nrows=0, skipfooter=0),
+ ).columns
+ usecols = all_cols.get_indexer_for(list(usecols_md[0]))
+ parse_dates = kwargs.pop("parse_dates", False)
+ partition_kwargs = dict(
+ kwargs,
+ header=None,
+ names=names,
+ skipfooter=0,
+ skiprows=None,
+ parse_dates=parse_dates,
+ usecols=usecols,
+ )
+ encoding = kwargs.get("encoding", None)
+ quotechar = kwargs.get("quotechar", '"').encode(
+ encoding if encoding is not None else "UTF-8"
+ )
+ is_quoting = kwargs.get("quoting", "") != csv.QUOTE_NONE
+
+ with ExitStack() as stack:
+ files = [
+ stack.enter_context(cls.file_open(fname, "rb", compression_type))
+ for fname in glob_filepaths
+ ]
+
+ # Skip the header since we already have the header information and skip the
+ # rows we are told to skip.
+ if isinstance(skiprows, int) or skiprows is None:
+ if skiprows is None:
+ skiprows = 0
+ header = kwargs.get("header", "infer")
+ if header == "infer" and kwargs.get("names", None) is None:
+ skip_header = 1
+ elif isinstance(header, int):
+ skip_header = header + 1
+ elif hasattr(header, "__iter__") and not isinstance(header, str):
+ skip_header = max(header) + 1
+ else:
+ skip_header = 0
+ if kwargs.get("encoding", None) is not None:
+ partition_kwargs["skiprows"] = 1
+ # Launch tasks to read partitions
+ partition_ids = []
+ index_ids = []
+ dtypes_ids = []
+ # Max number of partitions available
+ num_partitions = NPartitions.get()
+ # This is the number of splits for the columns
+ num_splits = min(len(column_names), num_partitions)
+ # Metadata
+ column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1)
+ if column_chunksize > len(column_names):
+ column_widths = [len(column_names)]
+ # This prevents us from unnecessarily serializing a bunch of empty
+ # objects.
+ num_splits = 1
+ else:
+ column_widths = [
+ column_chunksize
+ if len(column_names) > (column_chunksize * (i + 1))
+ else 0
+ if len(column_names) < (column_chunksize * i)
+ else len(column_names) - (column_chunksize * i)
+ for i in range(num_splits)
+ ]
+
+ args = {
+ "num_splits": num_splits,
+ **partition_kwargs,
+ }
+
+ splits = cls.partitioned_file(
+ files,
+ glob_filepaths,
+ num_partitions=num_partitions,
+ nrows=nrows,
+ skiprows=skiprows,
+ skip_header=skip_header,
+ quotechar=quotechar,
+ is_quoting=is_quoting,
+ )
+
+ for chunks in splits:
+ args.update({"chunks": chunks})
+ partition_id = cls.deploy(cls.parse, num_splits + 2, args)
+ partition_ids.append(partition_id[:-2])
+ index_ids.append(partition_id[-2])
+ dtypes_ids.append(partition_id[-1])
+
+ # Compute the index based on a sum of the lengths of each partition (by default)
+ # or based on the column(s) that were requested.
+ if index_col is None:
+ row_lengths = cls.materialize(index_ids)
+ new_index = pandas.RangeIndex(sum(row_lengths))
+ else:
+ index_objs = cls.materialize(index_ids)
+ row_lengths = [len(o) for o in index_objs]
+ new_index = index_objs[0].append(index_objs[1:])
+ new_index.name = empty_pd_df.index.name
+
+ # Compute dtypes by getting collecting and combining all of the partitions. The
+ # reported dtypes from differing rows can be different based on the inference in
+ # the limited data seen by each worker. We use pandas to compute the exact dtype
+ # over the whole column for each column. The index is set below.
+ dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
+
+ partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
+ # If parse_dates is present, the column names that we have might not be
+ # the same length as the returned column names. If we do need to modify
+ # the column names, we remove the old names from the column names and
+ # insert the new one at the front of the Index.
+ if parse_dates is not None:
+ # We have to recompute the column widths if `parse_dates` is set because
+ # we are not guaranteed to have the correct information regarding how many
+ # columns are on each partition.
+ column_widths = None
+ # Check if it is a list of lists
+ if isinstance(parse_dates, list) and isinstance(parse_dates[0], list):
+ for group in parse_dates:
+ new_col_name = "_".join(group)
+ column_names = column_names.drop(group).insert(0, new_col_name)
+ # Check if it is a dictionary
+ elif isinstance(parse_dates, dict):
+ for new_col_name, group in parse_dates.items():
+ column_names = column_names.drop(group).insert(0, new_col_name)
+ # Set the index for the dtypes to the column names
+ if isinstance(dtypes, pandas.Series):
+ dtypes.index = column_names
+ else:
+ dtypes = pandas.Series(dtypes, index=column_names)
+ new_frame = cls.frame_cls(
+ partition_ids,
+ new_index,
+ column_names,
+ row_lengths,
+ column_widths,
+ dtypes=dtypes,
+ )
+ new_query_compiler = cls.query_compiler_cls(new_frame)
+
+ if skipfooter:
+ new_query_compiler = new_query_compiler.drop(
+ new_query_compiler.index[-skipfooter:]
+ )
+ if kwargs.get("squeeze", False) and len(new_query_compiler.columns) == 1:
+ return new_query_compiler[new_query_compiler.columns[0]]
+ if index_col is None:
+ new_query_compiler._modin_frame._apply_index_objs(axis=0)
+ return new_query_compiler
+
+ @classmethod
+ def file_exists(cls, file_path: str) -> bool:
+ """
+ Check if the file_path is valid.
+
+ Parameters
+ ----------
+ file_path: str
+ String representing a path.
+
+ Returns
+ -------
+ bool
+ True if the glob path is valid.
+ """
+ if isinstance(file_path, str):
+ match = S3_ADDRESS_REGEX.search(file_path)
+ if match is not None:
+ if file_path[0] == "S":
+ file_path = "{}{}".format("s", file_path[1:])
+ import s3fs as S3FS
+ from botocore.exceptions import NoCredentialsError
+
+ s3fs = S3FS.S3FileSystem(anon=False)
+ exists = False
+ try:
+ exists = len(s3fs.glob(file_path)) > 0 or exists
+ except NoCredentialsError:
+ pass
+ s3fs = S3FS.S3FileSystem(anon=True)
+ return exists or len(s3fs.glob(file_path)) > 0
+ return len(glob.glob(file_path)) > 0
+
+ @classmethod
+ def get_path(cls, file_path: str) -> list:
+ """
+ Return the path of the file(s).
+
+ Parameters
+ ----------
+ file_path: str
+ String representing a path.
+
+ Returns
+ -------
+ list
+ List of strings of absolute file paths.
+ """
+ if S3_ADDRESS_REGEX.search(file_path):
+ # S3FS does not allow capital S in s3 addresses.
+ if file_path[0] == "S":
+ file_path = "{}{}".format("s", file_path[1:])
+
+ import s3fs as S3FS
+ from botocore.exceptions import NoCredentialsError
+
+ def get_file_path(fs_handle) -> List[str]:
+ file_paths = fs_handle.glob(file_path)
+ s3_addresses = ["{}{}".format("s3://", path) for path in file_paths]
+ return s3_addresses
+
+ s3fs = S3FS.S3FileSystem(anon=False)
+ try:
+ return get_file_path(s3fs)
+ except NoCredentialsError:
+ pass
+ s3fs = S3FS.S3FileSystem(anon=True)
+ return get_file_path(s3fs)
+ else:
+ relative_paths = glob.glob(file_path)
+ abs_paths = [os.path.abspath(path) for path in relative_paths]
+ return abs_paths
+
+ @classmethod
+ def partitioned_file(
+ cls,
+ files,
+ fnames: List[str],
+ num_partitions: int = None,
+ nrows: int = None,
+ skiprows: int = None,
+ skip_header: int = None,
+ quotechar: bytes = b'"',
+ is_quoting: bool = True,
+ ) -> List[List[Tuple[str, int, int]]]:
+ """
+ Compute chunk sizes in bytes for every partition.
+
+ Parameters
+ ----------
+ files: file or list of files
+ File(s) to be partitioned.
+ fnames: str or list of str
+ File name(s) to be partitioned.
+ num_partitions: int, optional
+ The number of partitions to split the file into.
+ If not specified, grabs the value from `modin.config.NPartitions.get()`.
+ nrows: int, optional
+ Number of rows of file to read.
+ skiprows: int, optional
+ Specifies rows to skip.
+ skip_header: int, optional
+ Specifies header rows to skip.
+ quotechar: bytes, default b'"'
+ Indicate quote in a file.
+ is_quoting: bool, default True
+ Whether or not to consider quotes.
+
+ Notes
+ -----
+ The logic gets really complicated if we try to use the TextFileDispatcher.partitioned_file().
+
+ Returns
+ -------
+ list
+ List, where each element of the list is a list of tuples. Each inner tuple
+ contains the name of the file a chunk belongs to and the chunk's start and end offsets within that file.
+ """
+ if type(files) != list:
+ files = [files]
+
+ if num_partitions is None:
+ num_partitions = NPartitions.get()
+
+ file_sizes = [cls.file_size(f) for f in files]
+ partition_size = max(
+ 1, num_partitions, (nrows if nrows else sum(file_sizes)) // num_partitions
+ )
+
+ result = []
+ split_result = []
+ split_size = 0
+ read_rows_counter = 0
+ for f, fname, f_size in zip(files, fnames, file_sizes):
+ if skiprows or skip_header:
+ skip_amount = (skiprows if skiprows else 0) + (
+ skip_header if skip_header else 0
+ )
+
+ # TODO(williamma12): Handle when skiprows > number of rows in file. Currently returns empty df.
+ outside_quotes, read_rows = cls._read_rows(
+ f,
+ nrows=skip_amount,
+ quotechar=quotechar,
+ is_quoting=is_quoting,
+ )
+ if skiprows:
+ skiprows -= read_rows
+ if skiprows > 0:
+ # We have more rows to skip than the amount read in the file.
+ continue
+
+ start = f.tell()
+
+ while f.tell() < f_size:
+ if split_size >= partition_size:
+ # Create a new split when the split has reached partition_size.
+ # This is mainly used when we are reading row-wise partitioned files.
+ result.append(split_result)
+ split_result = []
+ split_size = 0
+
+ # We calculate the amount that we need to read based off of how much of the split we have already read.
+ read_size = partition_size - split_size
+
+ if nrows:
+ if read_rows_counter >= nrows:
+ # Finish when we have read enough rows.
+ if len(split_result) > 0:
+ # Add last split into the result.
+ result.append(split_result)
+ return result
+ elif read_rows_counter + read_size > nrows:
+ # Ensure that we will not read more than nrows.
+ read_size = nrows - read_rows_counter
+
+ outside_quotes, read_rows = cls._read_rows(
+ f,
+ nrows=read_size,
+ quotechar=quotechar,
+ is_quoting=is_quoting,
+ )
+ split_size += read_rows
+ read_rows_counter += read_rows
+ else:
+ outside_quotes = cls.offset(
+ f,
+ offset_size=read_size,
+ quotechar=quotechar,
+ is_quoting=is_quoting,
+ )
+
+ split_result.append((fname, start, f.tell()))
+ split_size += f.tell() - start
+ start = f.tell()
+
+ # Add outside_quotes.
+ if is_quoting and not outside_quotes:
+ warnings.warn("File has mismatched quotes")
+
+ # Add last split into the result.
+ if len(split_result) > 0:
+ result.append(split_result)
+
+ return result
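To make the return shape of `partitioned_file` concrete, here is an illustrative (entirely made-up) result for two files split into three row-wise partitions; each inner list is one partition and each tuple is (file name, start offset, end offset) in bytes:

    splits = [
        [("a.csv", 0, 520)],                        # partition 0: first chunk of a.csv
        [("a.csv", 520, 1200), ("b.csv", 0, 90)],   # partition 1 spans the file boundary
        [("b.csv", 90, 400)],                       # partition 2: remainder of b.csv
    ]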
diff --git a/modin/engines/base/io/text/excel_reader.py b/modin/engines/base/io/text/excel_dispatcher.py
similarity index 74%
rename from modin/engines/base/io/text/excel_reader.py
rename to modin/engines/base/io/text/excel_dispatcher.py
index b645757cf16..26bf423a012 100644
--- a/modin/engines/base/io/text/excel_reader.py
+++ b/modin/engines/base/io/text/excel_dispatcher.py
@@ -17,13 +17,13 @@
import warnings
from modin.data_management.utils import compute_chunksize
-from modin.engines.base.io.text.text_file_reader import TextFileReader
-
+from modin.engines.base.io.text.text_file_dispatcher import TextFileDispatcher
+from modin.config import NPartitions
EXCEL_READ_BLOCK_SIZE = 4096
-class ExcelReader(TextFileReader):
+class ExcelDispatcher(TextFileDispatcher):
@classmethod
def _read(cls, io, **kwargs):
if (
@@ -41,7 +41,6 @@ def _read(cls, io, **kwargs):
return cls.single_worker_read(io, **kwargs)
from zipfile import ZipFile
- from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.worksheet._reader import WorksheetReader
from openpyxl.reader.excel import ExcelReader
@@ -59,30 +58,47 @@ def _read(cls, io, **kwargs):
"Parallel `read_excel` is a new feature! Please email "
"bug_reports@modin.org if you run into any problems."
)
- wb = load_workbook(filename=io, read_only=True)
- # Get shared strings
- ex = ExcelReader(io, read_only=True)
- ex.read_manifest()
- ex.read_strings()
- ws = Worksheet(wb)
- # Convert index to sheet name in file
- if isinstance(sheet_name, int):
- sheet_name = "sheet{}".format(sheet_name + 1)
- else:
- sheet_name = "sheet{}".format(wb.sheetnames.index(sheet_name) + 1)
- # Pass this value to the workers
- kwargs["sheet_name"] = sheet_name
+ # NOTE: ExcelReader() in read-only mode does not close the file handle by itself;
+ # work around that by passing a file object when we received a path
+ io_file = open(io, "rb") if isinstance(io, str) else io
+ try:
+ ex = ExcelReader(io_file, read_only=True)
+ ex.read()
+ wb = ex.wb
+
+ # Get shared strings
+ ex.read_manifest()
+ ex.read_strings()
+ ws = Worksheet(wb)
+ finally:
+ if isinstance(io, str):
+ # close only if we opened the object ourselves
+ io_file.close()
+
+ pandas_kw = dict(kwargs) # preserve original kwargs
with ZipFile(io) as z:
from io import BytesIO
- f = z.open("xl/worksheets/{}.xml".format(sheet_name.lower()))
+ # Convert index to sheet name in file
+ if isinstance(sheet_name, int):
+ sheet_name = "sheet{}".format(sheet_name + 1)
+ else:
+ sheet_name = "sheet{}".format(wb.sheetnames.index(sheet_name) + 1)
+ if any(sheet_name.lower() in name for name in z.namelist()):
+ sheet_name = sheet_name.lower()
+ elif any(sheet_name.title() in name for name in z.namelist()):
+ sheet_name = sheet_name.title()
+ else:
+ raise ValueError("Sheet {} not found".format(sheet_name.lower()))
+ # Pass this value to the workers
+ kwargs["sheet_name"] = sheet_name
+
+ f = z.open("xl/worksheets/{}.xml".format(sheet_name))
f = BytesIO(f.read())
total_bytes = cls.file_size(f)
- from modin.pandas import DEFAULT_NPARTITIONS
-
- num_partitions = DEFAULT_NPARTITIONS
+ num_partitions = NPartitions.get()
# Read some bytes from the sheet so we can extract the XML header and first
# line. We need to make sure we get the first line of the data as well
# because that is where the column names are. The header information will
@@ -112,13 +128,20 @@ def _read(cls, io, **kwargs):
# Remove column names that are specified as `index_col`
if index_col is not None:
column_names = column_names.drop(column_names[index_col])
+
+ if not all(column_names):
+ # some column names are empty; use the pandas reader to get the names from the file
+ pandas_kw["nrows"] = 1
+ df = pandas.read_excel(io, **pandas_kw)
+ column_names = df.columns
+
# Compute partition metadata upfront so it is uniform for all partitions
chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
num_splits = min(len(column_names), num_partitions)
kwargs["fname"] = io
# Skiprows will be used to inform a partition how many rows come before it.
kwargs["skiprows"] = 0
- row_count = 0
+ rows_to_skip = 0
data_ids = []
index_ids = []
dtypes_ids = []
@@ -145,7 +168,7 @@ def _read(cls, io, **kwargs):
while f.tell() < total_bytes:
args = kwargs
- args["skiprows"] = row_count + args["skiprows"]
+ args["skiprows"] = rows_to_skip
args["start"] = f.tell()
chunk = f.read(chunk_size)
# This edge case can happen when we have reached the end of the data
@@ -167,6 +190,21 @@ def _read(cls, io, **kwargs):
# If there is no data, exit before triggering computation.
if b"" not in chunk and b"" in chunk:
break
+ # We need to make sure we include all rows, even those that have no
+ # data. Getting the number of the last row will turn into the number of
+ # skipped rows, so if there are any rows missing between the last row
+ # seen here and the first row the next partition reads, the parser will
+ # have to include those rows in that specific partition to match the
+ # expected behavior. We subtract 1 here because the header is included
+ # in the skip values, and we do not want to skip the header.
+ rows_to_skip = (
+ int(
+ chunk[: last_index + len(row_close_tag)]
+ .split(b'<row r="')[-1]
+ .split(b'"')[0]
+ )
+ - 1
+ )
+ if read_rows_counter + partition_size > nrows:
+ # it's possible only if is_quoting==True
+ partition_size = nrows - read_rows_counter
+ outside_quotes, read_rows = cls._read_rows(
+ f,
+ nrows=partition_size,
+ quotechar=quotechar,
+ is_quoting=is_quoting,
+ )
+ result.append((start, f.tell()))
+ start = f.tell()
+ read_rows_counter += read_rows
+
+ # add outside_quotes
+ if is_quoting and not outside_quotes:
+ warnings.warn("File has mismatched quotes")
+ else:
+ partition_size = max(1, num_partitions, file_size // num_partitions)
+ while f.tell() < file_size:
+ outside_quotes = cls.offset(
+ f,
+ offset_size=partition_size,
+ quotechar=quotechar,
+ is_quoting=is_quoting,
+ )
+
+ result.append((start, f.tell()))
+ start = f.tell()
+
+ # add outside_quotes
+ if is_quoting and not outside_quotes:
+ warnings.warn("File has mismatched quotes")
+
+ return result
+
+ @classmethod
+ def _read_rows(
+ cls,
+ f,
+ nrows: int,
+ quotechar: bytes = b'"',
+ is_quoting: bool = True,
+ outside_quotes: bool = True,
+ ):
+ """
+ Move the file offset at the specified amount of rows.
+
+ Parameters
+ ----------
+ f: file object
+ nrows: int
+ Number of rows to read.
+ quotechar: bytes, default b'"'
+ Indicate quote in a file.
+ is_quoting: bool, default True
+ Whether or not to consider quotes.
+ outside_quotes: bool, default True
+ Whether the file pointer is within quotes or not at the time this function is called.
+
+ Returns
+ -------
+ tuple of bool and int,
+ bool: If file pointer reached the end of the file, but did not find
+ closing quote returns `False`. `True` in any other case.
+ int: Number of rows that were read.
+ """
+ if nrows is not None and nrows <= 0:
+ return True, 0
+
+ rows_read = 0
+
+ for line in f:
+ if is_quoting and line.count(quotechar) % 2:
+ outside_quotes = not outside_quotes
+ if outside_quotes:
+ rows_read += 1
+ if rows_read >= nrows:
+ break
+
+ # case when EOF is reached while still inside quotes
+ if not outside_quotes:
+ rows_read += 1
+
+ return outside_quotes, rows_read
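A hedged usage sketch of the quote-aware row counting above, assuming (as the import in `__init__.py` suggests) that the helper lives on `TextFileDispatcher`:

    import io
    from modin.engines.base.io.text.text_file_dispatcher import TextFileDispatcher

    # The quoted field contains a newline, so it must count as part of one row.
    f = io.BytesIO(b'a,b\n1,"x\ny"\n2,z\n')
    outside_quotes, rows_read = TextFileDispatcher._read_rows(f, nrows=2)
    print(rows_read)     # 2
    print(f.readline())  # b'2,z\n' -- the offset now sits after the quoted record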
diff --git a/modin/engines/base/io/text/text_file_reader.py b/modin/engines/base/io/text/text_file_reader.py
deleted file mode 100644
index bc194cc2986..00000000000
--- a/modin/engines/base/io/text/text_file_reader.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Licensed to Modin Development Team under one or more contributor license agreements.
-# See the NOTICE file distributed with this work for additional information regarding
-# copyright ownership. The Modin Development Team licenses this file to you under the
-# Apache License, Version 2.0 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under
-# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific language
-# governing permissions and limitations under the License.
-
-from modin.engines.base.io.file_reader import FileReader
-import numpy as np
-import warnings
-import os
-
-
-class TextFileReader(FileReader):
- @classmethod
- def build_partition(cls, partition_ids, row_lengths, column_widths):
- return np.array(
- [
- [
- cls.frame_partition_cls(
- partition_ids[i][j],
- length=row_lengths[i],
- width=column_widths[j],
- )
- for j in range(len(partition_ids[i]))
- ]
- for i in range(len(partition_ids))
- ]
- )
-
- @classmethod
- def pathlib_or_pypath(cls, filepath_or_buffer):
- try:
- import py
-
- if isinstance(filepath_or_buffer, py.path.local):
- return True
- except ImportError: # pragma: no cover
- pass
- try:
- import pathlib
-
- if isinstance(filepath_or_buffer, pathlib.Path):
- return True
- except ImportError: # pragma: no cover
- pass
- return False
-
- @classmethod
- def offset(
- cls,
- f,
- nrows=None,
- skiprows=None,
- chunk_size_bytes=None,
- quotechar=b'"',
- is_quoting=True,
- ):
- """
- Moves the file offset at the specified amount of bytes/rows.
-
- Parameters
- ----------
- f: file object
- nrows: int, number of rows to read. Optional, if not specified will only
- consider `chunk_size_bytes` parameter.
- chunk_size_bytes: int, Will read new rows while file pointer
- is less than `chunk_size_bytes`. Optional, if not specified will only
- consider `nrows` parameter.
- skiprows: array or callable (optional), specifies rows to skip
- quotechar: char that indicates quote in a file
- (optional, by default it's '\"')
- is_quoting: bool, Whether or not to consider quotes
- (optional, by default it's `True`)
-
- Returns
- -------
- bool: If file pointer reached the end of the file, but did not find
- closing quote returns `False`. `True` in any other case.
- """
- assert (
- nrows is not None or chunk_size_bytes is not None
- ), "`nrows` and `chunk_size_bytes` can't be None at the same time"
-
- if nrows is not None or skiprows is not None:
- return cls._read_rows(
- f,
- nrows=nrows,
- skiprows=skiprows,
- quotechar=quotechar,
- is_quoting=is_quoting,
- max_bytes=chunk_size_bytes,
- )[0]
-
- outside_quotes = True
-
- if is_quoting:
- chunk = f.read(chunk_size_bytes)
- line = f.readline() # Ensure we read up to a newline
- # We need to ensure that one row isn't split across different partitions
- outside_quotes = not ((chunk.count(quotechar) + line.count(quotechar)) % 2)
- while not outside_quotes:
- line = f.readline()
- outside_quotes = line.count(quotechar) % 2
- if not line:
- break
- else:
- f.seek(chunk_size_bytes, os.SEEK_CUR)
- f.readline()
- return outside_quotes
-
- @classmethod
- def partitioned_file(
- cls,
- f,
- nrows=None,
- skiprows=None,
- num_partitions=None,
- quotechar=b'"',
- is_quoting=True,
- from_begin=False,
- ):
- """Computes chunk sizes in bytes for every partition.
-
- Parameters
- ----------
- f: file to be partitioned
- nrows: int (optional), number of rows of file to read
- skiprows: array or callable (optional), specifies rows to skip
- num_partitions: int, for what number of partitions split a file.
- Optional, if not specified grabs the value from `modin.pandas.DEFAULT_NPARTITIONS`
- quotechar: char that indicates quote in a file
- (optional, by default it's '\"')
- is_quoting: bool, Whether or not to consider quotes
- (optional, by default it's `True`)
- from_begin: bool, Whether or not to set the file pointer to the begining of the file
- (optional, by default it's `False`)
-
- Returns
- -------
- An array, where each element of array is a tuple of two ints:
- beginning and the end offsets of the current chunk.
- """
- if num_partitions is None:
- from modin.pandas import DEFAULT_NPARTITIONS
-
- num_partitions = DEFAULT_NPARTITIONS
-
- result = []
-
- old_position = f.tell()
- if from_begin:
- f.seek(0, os.SEEK_SET)
-
- current_start = f.tell()
- total_bytes = cls.file_size(f)
-
- # if `nrows` are specified we want to use rows as a part measure
- if nrows is not None:
- chunk_size_bytes = None
- rows_per_part = max(1, num_partitions, nrows // num_partitions)
- else:
- chunk_size_bytes = max(1, num_partitions, total_bytes // num_partitions)
- rows_per_part = None
- nrows = float("inf")
-
- rows_readed = 0
- while f.tell() < total_bytes and rows_readed < nrows:
- if rows_per_part is not None and rows_readed + rows_per_part > nrows:
- rows_per_part = nrows - rows_readed
-
- outside_quotes = cls.offset(
- f,
- nrows=rows_per_part,
- skiprows=skiprows,
- chunk_size_bytes=chunk_size_bytes,
- quotechar=quotechar,
- is_quoting=is_quoting,
- )
-
- result.append((current_start, f.tell()))
- current_start = f.tell()
- if rows_per_part is not None:
- rows_readed += rows_per_part
-
- if is_quoting and not outside_quotes:
- warnings.warn("File has mismatched quotes")
-
- f.seek(old_position, os.SEEK_SET)
-
- return result
-
- @classmethod
- def _read_rows(
- cls,
- f,
- nrows=None,
- skiprows=None,
- quotechar=b'"',
- is_quoting=True,
- max_bytes=None,
- ):
- """
- Moves the file offset at the specified amount of rows
- Note: the difference between `offset` is that `_read_rows` is more
- specific version of `offset` which is focused of reading **rows**.
- In common case it's better to use `offset`.
-
- Parameters
- ----------
- f: file object
- nrows: int, number of rows to read. Optional, if not specified will only
- consider `max_bytes` parameter.
- skiprows: int, array or callable (optional), specifies rows to skip
- quotechar: char that indicates quote in a file
- (optional, by default it's '\"')
- is_quoting: bool, Whether or not to consider quotes
- (optional, by default it's `True`)
- max_bytes: int, Will read new rows while file pointer
- is less than `max_bytes`. Optional, if not specified will only
- consider `nrows` parameter, if both not specified will read till
- the end of the file.
-
- Returns
- -------
- tuple of bool and int,
- bool: If file pointer reached the end of the file, but did not find
- closing quote returns `False`. `True` in any other case.
- int: Number of rows that was readed.
- """
- assert skiprows is None or isinstance(
- skiprows, int
- ), f"Skiprows as a {type(skiprows)} is not supported yet."
-
- if nrows is None and max_bytes is None:
- max_bytes = float("inf")
-
- if nrows is not None and nrows <= 0:
- return True, 0
-
- # we need this condition to avoid unnecessary checks in `stop_condition`
- # which executes in a huge for loop
- if nrows is not None and max_bytes is None:
- stop_condition = lambda rows_readed: rows_readed >= nrows # noqa (E731)
- elif nrows is not None and max_bytes is not None:
- stop_condition = (
- lambda rows_readed: f.tell() >= max_bytes or rows_readed >= nrows
- ) # noqa (E731)
- else:
- stop_condition = lambda rows_readed: f.tell() >= max_bytes # noqa (E731)
-
- if max_bytes is not None:
- max_bytes = max_bytes + f.tell()
-
- rows_readed = 0
- outside_quotes = True
- for line in f:
- if is_quoting and line.count(quotechar) % 2:
- outside_quotes = not outside_quotes
- if outside_quotes:
- rows_readed += 1
- if stop_condition(rows_readed):
- break
-
- if not outside_quotes:
- rows_readed += 1
-
- return outside_quotes, rows_readed
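
The helpers removed above implement the low-level CSV splitting logic: advance the file pointer by roughly `total_bytes // num_partitions`, finish the current line, and keep reading whole lines until the split point falls outside any open quoted field. A minimal standalone sketch of that idea, assuming a plain on-disk file (`compute_chunks` and its signature are illustrative, not Modin API):

import os


def compute_chunks(path, num_partitions, quotechar=b'"'):
    """Split a text file into byte ranges that end on line boundaries
    and never cut a quoted field in half (illustrative sketch only)."""
    result = []
    with open(path, "rb") as f:
        f.seek(0, os.SEEK_END)
        total_bytes = f.tell()
        f.seek(0, os.SEEK_SET)
        chunk_size = max(1, total_bytes // num_partitions)
        start = f.tell()
        while f.tell() < total_bytes:
            chunk = f.read(chunk_size)
            line = f.readline()  # always finish the row we stopped in
            # An odd number of quote characters means the split point is
            # inside a quoted field, so keep consuming lines until it closes.
            outside_quotes = (chunk.count(quotechar) + line.count(quotechar)) % 2 == 0
            while not outside_quotes:
                line = f.readline()
                if line.count(quotechar) % 2:
                    outside_quotes = True
                if not line:
                    break
            result.append((start, f.tell()))
            start = f.tell()
    return result

Each `(start, end)` pair can then be handed to a worker that reads only its own byte range, which is why rows and quoted fields must never straddle a chunk boundary.
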
diff --git a/modin/engines/dask/pandas_on_dask/frame/axis_partition.py b/modin/engines/dask/pandas_on_dask/frame/axis_partition.py
index b3f98a0fb65..8ee82996cc8 100644
--- a/modin/engines/dask/pandas_on_dask/frame/axis_partition.py
+++ b/modin/engines/dask/pandas_on_dask/frame/axis_partition.py
@@ -16,14 +16,18 @@
from distributed.client import get_client
from distributed import Future
+from distributed.utils import get_ip
+import pandas
class PandasOnDaskFrameAxisPartition(PandasFrameAxisPartition):
- def __init__(self, list_of_blocks):
+ def __init__(self, list_of_blocks, bind_ip=False):
# Unwrap from BaseFramePartition object for ease of use
for obj in list_of_blocks:
obj.drain_call_queue()
self.list_of_blocks = [obj.future for obj in list_of_blocks]
+ if bind_ip:
+ self.list_of_ips = [obj.ip for obj in list_of_blocks]
partition_type = PandasOnDaskFramePartition
instance_type = Future
@@ -34,6 +38,7 @@ def deploy_axis_func(
):
client = get_client()
axis_result = client.submit(
+ deploy_dask_func,
PandasFrameAxisPartition.deploy_axis_func,
axis,
func,
@@ -43,13 +48,15 @@ def deploy_axis_func(
*partitions,
pure=False,
)
- if num_splits == 1:
- return axis_result
+
+ lengths = kwargs.get("_lengths", None)
+ result_num_splits = len(lengths) if lengths else num_splits
+
# We have to do this to split it back up. It is already split, but we need to
# get futures for each.
return [
client.submit(lambda l: l[i], axis_result, pure=False)
- for i in range(num_splits)
+ for i in range(result_num_splits * 4)
]
@classmethod
@@ -58,6 +65,7 @@ def deploy_func_between_two_axis_partitions(
):
client = get_client()
axis_result = client.submit(
+ deploy_dask_func,
PandasFrameAxisPartition.deploy_func_between_two_axis_partitions,
axis,
func,
@@ -68,13 +76,17 @@ def deploy_func_between_two_axis_partitions(
*partitions,
pure=False,
)
- if num_splits == 1:
- return axis_result
# We have to do this to split it back up. It is already split, but we need to
# get futures for each.
return [
client.submit(lambda l: l[i], axis_result, pure=False)
- for i in range(num_splits)
+ for i in range(num_splits * 4)
+ ]
+
+ def _wrap_partitions(self, partitions):
+ return [
+ self.partition_type(future, length, width, ip)
+ for (future, length, width, ip) in zip(*[iter(partitions)] * 4)
]
@@ -94,3 +106,26 @@ class PandasOnDaskFrameRowPartition(PandasOnDaskFrameAxisPartition):
"""
axis = 1
+
+
+def deploy_dask_func(func, *args):
+ """
+ Run a function on a remote partition.
+
+ Parameters
+ ----------
+ func : callable
+ The function to run.
+
+ Returns
+ -------
+ The result of the function `func`.
+ """
+ result = func(*args)
+ ip = get_ip()
+ if isinstance(result, pandas.DataFrame):
+ return result, len(result), len(result.columns), ip
+ elif all(isinstance(r, pandas.DataFrame) for r in result):
+ return [i for r in result for i in [r, len(r), len(r.columns), ip]]
+ else:
+ return [i for r in result for i in [r, None, None, ip]]
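
Each remote call now returns a flat list laid out as `(dataframe, length, width, ip)` quadruples instead of the previous triples, which is why the splitting loops use a factor of 4 and `_wrap_partitions` regroups with `zip(*[iter(partitions)] * 4)`. A small, self-contained illustration of that regrouping idiom:

# A flattened result as produced above: [df0, len0, width0, ip0, df1, ...].
flat = ["df0", 3, 2, "10.0.0.1", "df1", 5, 2, "10.0.0.2"]

# zip(*[iter(flat)] * 4) hands the *same* iterator to zip four times,
# so every output tuple consumes four consecutive elements.
quads = list(zip(*[iter(flat)] * 4))
assert quads == [("df0", 3, 2, "10.0.0.1"), ("df1", 5, 2, "10.0.0.2")]
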
diff --git a/modin/engines/dask/pandas_on_dask/frame/partition.py b/modin/engines/dask/pandas_on_dask/frame/partition.py
index c3686965ca7..bbc16a27e19 100644
--- a/modin/engines/dask/pandas_on_dask/frame/partition.py
+++ b/modin/engines/dask/pandas_on_dask/frame/partition.py
@@ -17,6 +17,7 @@
from modin.data_management.utils import length_fn_pandas, width_fn_pandas
from distributed.client import get_client
+from distributed.utils import get_ip
import cloudpickle as pkl
@@ -25,7 +26,7 @@ def apply_list_of_funcs(funcs, df):
if isinstance(func, bytes):
func = pkl.loads(func)
df = func(df, **kwargs)
- return df
+ return df, get_ip()
class PandasOnDaskFramePartition(BaseFramePartition):
@@ -40,13 +41,14 @@ class PandasOnDaskFramePartition(BaseFramePartition):
subclasses. There is no logic for updating inplace.
"""
- def __init__(self, future, length=None, width=None, call_queue=None):
+ def __init__(self, future, length=None, width=None, ip=None, call_queue=None):
self.future = future
if call_queue is None:
call_queue = []
self.call_queue = call_queue
self._length_cache = length
self._width_cache = width
+ self.ip = ip
def get(self):
"""Flushes the call_queue and returns the data.
@@ -81,7 +83,8 @@ def apply(self, func, **kwargs):
future = get_client().submit(
apply_list_of_funcs, call_queue, self.future, pure=False
)
- return PandasOnDaskFramePartition(future)
+ futures = [get_client().submit(lambda l: l[i], future) for i in range(2)]
+ return PandasOnDaskFramePartition(futures[0], ip=futures[1])
def add_to_apply_calls(self, func, **kwargs):
return PandasOnDaskFramePartition(
@@ -91,7 +94,9 @@ def add_to_apply_calls(self, func, **kwargs):
def drain_call_queue(self):
if len(self.call_queue) == 0:
return
- self.future = self.apply(lambda x: x).future
+ new_partition = self.apply(lambda x: x)
+ self.future = new_partition.future
+ self.ip = new_partition.ip
self.call_queue = []
def mask(self, row_indices, col_indices):
@@ -125,7 +130,7 @@ def to_pandas(self):
def to_numpy(self, **kwargs):
"""
- Convert the object stored in this parition to a NumPy array.
+ Convert the object stored in this partition to a NumPy array.
Returns
-------
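
`apply_list_of_funcs` now returns a `(dataframe, worker_ip)` pair, so `apply` submits one task and then fans the pair out into two futures with indexing lambdas: one for the data, one for the IP. A rough stand-in for that fan-out using `concurrent.futures` instead of the Dask client (Dask resolves future arguments automatically; here `.result()` is called explicitly, and all names are illustrative):

from concurrent.futures import ThreadPoolExecutor


def work(x):
    # Pretend this runs remotely and also reports which worker handled it.
    return x * 2, "worker-a"


with ThreadPoolExecutor(max_workers=4) as pool:
    pair_future = pool.submit(work, 21)
    # Fan the single pair out into one future per element.
    parts = [pool.submit(lambda fut, i=i: fut.result()[i], pair_future) for i in range(2)]
    data, ip = parts[0].result(), parts[1].result()

assert (data, ip) == (42, "worker-a")
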
diff --git a/modin/engines/dask/pandas_on_dask/io.py b/modin/engines/dask/pandas_on_dask/io.py
index 127f96e727f..91b59b561e2 100644
--- a/modin/engines/dask/pandas_on_dask/io.py
+++ b/modin/engines/dask/pandas_on_dask/io.py
@@ -16,12 +16,12 @@
from modin.engines.dask.pandas_on_dask.frame.data import PandasOnDaskFrame
from modin.engines.dask.pandas_on_dask.frame.partition import PandasOnDaskFramePartition
from modin.engines.base.io import (
- CSVReader,
- JSONReader,
- ParquetReader,
- FeatherReader,
- SQLReader,
- ExcelReader,
+ CSVDispatcher,
+ JSONDispatcher,
+ ParquetDispatcher,
+ FeatherDispatcher,
+ SQLDispatcher,
+ ExcelDispatcher,
)
from modin.backends.pandas.parsers import (
PandasCSVParser,
@@ -44,15 +44,17 @@ class PandasOnDaskIO(BaseIO):
query_compiler_cls=PandasQueryCompiler,
)
- read_csv = type("", (DaskTask, PandasCSVParser, CSVReader), build_args).read
- read_json = type("", (DaskTask, PandasJSONParser, JSONReader), build_args).read
+ read_csv = type("", (DaskTask, PandasCSVParser, CSVDispatcher), build_args).read
+ read_json = type("", (DaskTask, PandasJSONParser, JSONDispatcher), build_args).read
read_parquet = type(
- "", (DaskTask, PandasParquetParser, ParquetReader), build_args
+ "", (DaskTask, PandasParquetParser, ParquetDispatcher), build_args
).read
# Blocked on pandas-dev/pandas#12236. It is faster to default to pandas.
# read_hdf = type("", (DaskTask, PandasHDFParser, HDFReader), build_args).read
read_feather = type(
- "", (DaskTask, PandasFeatherParser, FeatherReader), build_args
+ "", (DaskTask, PandasFeatherParser, FeatherDispatcher), build_args
+ ).read
+ read_sql = type("", (DaskTask, PandasSQLParser, SQLDispatcher), build_args).read
+ read_excel = type(
+ "", (DaskTask, PandasExcelParser, ExcelDispatcher), build_args
).read
- read_sql = type("", (DaskTask, PandasSQLParser, SQLReader), build_args).read
- read_excel = type("", (DaskTask, PandasExcelParser, ExcelReader), build_args).read
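
The `read_*` attributes are built by composing three mixins on the fly with the three-argument form of `type()`: the task wrapper contributes deployment, the parser contributes per-chunk parsing, and the dispatcher contributes the top-level `read` driver. A toy version of that composition pattern (the classes below are invented for illustration, not Modin classes):

class Engine:
    @classmethod
    def deploy(cls, func, arg):
        return func(arg)


class Parser:
    @classmethod
    def parse(cls, chunk):
        return chunk.upper()


class Dispatcher:
    @classmethod
    def read(cls, data):
        # Relies on whatever `deploy` and `parse` the combined class provides.
        return cls.deploy(cls.parse, data)


# type(name, bases, namespace) assembles the combined class in one expression.
read_text = type("", (Engine, Parser, Dispatcher), {}).read
assert read_text("abc") == "ABC"
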
diff --git a/modin/engines/dask/task_wrapper.py b/modin/engines/dask/task_wrapper.py
index 04e5ed2a3b9..af717625afe 100644
--- a/modin/engines/dask/task_wrapper.py
+++ b/modin/engines/dask/task_wrapper.py
@@ -16,12 +16,12 @@
class DaskTask:
@classmethod
- def deploy(cls, func, num_return_vals, kwargs):
+ def deploy(cls, func, num_returns, kwargs):
client = _get_global_client()
remote_task_future = client.submit(func, **kwargs)
return [
client.submit(lambda l, i: l[i], remote_task_future, i)
- for i in range(num_return_vals)
+ for i in range(num_returns)
]
@classmethod
diff --git a/modin/engines/dask/utils.py b/modin/engines/dask/utils.py
new file mode 100644
index 00000000000..ae0c6db4974
--- /dev/null
+++ b/modin/engines/dask/utils.py
@@ -0,0 +1,35 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+from modin.config import CpuCount
+from modin.error_message import ErrorMessage
+
+
+def initialize_dask():
+ from distributed.client import get_client
+
+ try:
+ get_client()
+ except ValueError:
+ from distributed import Client
+
+        # This string is intentionally formatted this way; we want the code indented in the warning message.
+ ErrorMessage.not_initialized(
+ "Dask",
+ """
+ from distributed import Client
+
+ client = Client()
+""",
+ )
+ Client(n_workers=CpuCount.get())
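
`initialize_dask` only creates a local `distributed.Client` when none is running yet, after warning how to pre-initialize one. A hedged usage sketch (assumes `distributed` is installed; the exact config call for selecting the engine may differ between Modin versions):

from distributed import Client

client = Client(n_workers=4)   # start the cluster yourself, silencing the warning

import modin.config as cfg
cfg.Engine.put("Dask")         # point Modin at the Dask engine

import modin.pandas as pd      # dataframe operations now reuse the running client
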
diff --git a/modin/engines/ray/pandas_on_ray/frame/axis_partition.py b/modin/engines/ray/pandas_on_ray/frame/axis_partition.py
index 2099ea9fe93..21989fe1536 100644
--- a/modin/engines/ray/pandas_on_ray/frame/axis_partition.py
+++ b/modin/engines/ray/pandas_on_ray/frame/axis_partition.py
@@ -17,22 +17,26 @@
from .partition import PandasOnRayFramePartition
import ray
+from ray.services import get_node_ip_address
class PandasOnRayFrameAxisPartition(PandasFrameAxisPartition):
- def __init__(self, list_of_blocks):
+ def __init__(self, list_of_blocks, bind_ip=False):
# Unwrap from BaseFramePartition object for ease of use
for obj in list_of_blocks:
obj.drain_call_queue()
self.list_of_blocks = [obj.oid for obj in list_of_blocks]
+ if bind_ip:
+ self.list_of_ips = [obj.ip for obj in list_of_blocks]
partition_type = PandasOnRayFramePartition
- instance_type = ray.ObjectID
+ instance_type = ray.ObjectRef
@classmethod
def deploy_axis_func(
cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
+ lengths = kwargs.get("_lengths", None)
return deploy_ray_func._remote(
args=(
PandasFrameAxisPartition.deploy_axis_func,
@@ -43,7 +47,7 @@ def deploy_axis_func(
maintain_partitioning,
)
+ tuple(partitions),
- num_returns=num_splits * 3,
+ num_returns=num_splits * 4 if lengths is None else len(lengths) * 4,
)
@classmethod
@@ -61,13 +65,13 @@ def deploy_func_between_two_axis_partitions(
kwargs,
)
+ tuple(partitions),
- num_returns=num_splits * 3,
+ num_returns=num_splits * 4,
)
def _wrap_partitions(self, partitions):
return [
- self.partition_type(partitions[i], partitions[i + 1], partitions[i + 2])
- for i in range(0, len(partitions), 3)
+ self.partition_type(object_id, length, width, ip)
+ for (object_id, length, width, ip) in zip(*[iter(partitions)] * 4)
]
@@ -91,20 +95,27 @@ class PandasOnRayFrameRowPartition(PandasOnRayFrameAxisPartition):
@ray.remote
def deploy_ray_func(func, *args): # pragma: no cover
- """Run a function on a remote partition.
-
- Note: Ray functions are not detected by codecov (thus pragma: no cover)
+ """
+ Run a function on a remote partition.
- Args:
- func: The function to run.
+ Parameters
+ ----------
+ func : callable
+ The function to run.
- Returns:
+ Returns
+ -------
The result of the function `func`.
+
+ Notes
+ -----
+ Ray functions are not detected by codecov (thus pragma: no cover)
"""
result = func(*args)
+ ip = get_node_ip_address()
if isinstance(result, pandas.DataFrame):
- return result, len(result), len(result.columns)
+ return result, len(result), len(result.columns), ip
elif all(isinstance(r, pandas.DataFrame) for r in result):
- return [i for r in result for i in [r, len(r), len(r.columns)]]
+ return [i for r in result for i in [r, len(r), len(r.columns), ip]]
else:
- return [i for r in result for i in [r, None, None]]
+ return [i for r in result for i in [r, None, None, ip]]
diff --git a/modin/engines/ray/pandas_on_ray/frame/partition.py b/modin/engines/ray/pandas_on_ray/frame/partition.py
index abe37eace7b..242265ee070 100644
--- a/modin/engines/ray/pandas_on_ray/frame/partition.py
+++ b/modin/engines/ray/pandas_on_ray/frame/partition.py
@@ -18,12 +18,13 @@
from modin.engines.ray.utils import handle_ray_task_error
import ray
+from ray.services import get_node_ip_address
from ray.worker import RayTaskError
class PandasOnRayFramePartition(BaseFramePartition):
- def __init__(self, object_id, length=None, width=None, call_queue=None):
- assert type(object_id) is ray.ObjectID
+ def __init__(self, object_id, length=None, width=None, ip=None, call_queue=None):
+ assert type(object_id) is ray.ObjectRef
self.oid = object_id
if call_queue is None:
@@ -31,6 +32,7 @@ def __init__(self, object_id, length=None, width=None, call_queue=None):
self.call_queue = call_queue
self._length_cache = length
self._width_cache = width
+ self.ip = ip
def get(self):
"""Gets the object out of the plasma store.
@@ -48,7 +50,7 @@ def get(self):
def apply(self, func, **kwargs):
"""Apply a function to the object stored in this partition.
- Note: It does not matter if func is callable or an ObjectID. Ray will
+ Note: It does not matter if func is callable or an ObjectRef. Ray will
handle it correctly either way. The keyword arguments are sent as a
dictionary.
@@ -60,8 +62,8 @@ def apply(self, func, **kwargs):
"""
oid = self.oid
call_queue = self.call_queue + [(func, kwargs)]
- result, length, width = deploy_ray_func.remote(call_queue, oid)
- return PandasOnRayFramePartition(result, length, width)
+ result, length, width, ip = deploy_ray_func.remote(call_queue, oid)
+ return PandasOnRayFramePartition(result, length, width, ip)
def add_to_apply_calls(self, func, **kwargs):
return PandasOnRayFramePartition(
@@ -73,14 +75,17 @@ def drain_call_queue(self):
return
oid = self.oid
call_queue = self.call_queue
- self.oid, self._length_cache, self._width_cache = deploy_ray_func.remote(
- call_queue, oid
- )
+ (
+ self.oid,
+ self._length_cache,
+ self._width_cache,
+ self.ip,
+ ) = deploy_ray_func.remote(call_queue, oid)
self.call_queue = []
def __copy__(self):
return PandasOnRayFramePartition(
- self.oid, self._length_cache, self._width_cache, self.call_queue
+ self.oid, self._length_cache, self._width_cache, call_queue=self.call_queue
)
def to_pandas(self):
@@ -95,7 +100,7 @@ def to_pandas(self):
def to_numpy(self, **kwargs):
"""
- Convert the object stored in this parition to a NumPy array.
+ Convert the object stored in this partition to a NumPy array.
Returns
-------
@@ -150,7 +155,7 @@ def preprocess_func(cls, func):
func: The function to preprocess.
Returns:
- A ray.ObjectID.
+ A ray.ObjectRef.
"""
return ray.put(func)
@@ -162,7 +167,7 @@ def length(self):
self._length_cache, self._width_cache = get_index_and_columns.remote(
self.oid
)
- if isinstance(self._length_cache, ray.ObjectID):
+ if isinstance(self._length_cache, ray.ObjectRef):
try:
self._length_cache = ray.get(self._length_cache)
except RayTaskError as e:
@@ -177,7 +182,7 @@ def width(self):
self._length_cache, self._width_cache = get_index_and_columns.remote(
self.oid
)
- if isinstance(self._width_cache, ray.ObjectID):
+ if isinstance(self._width_cache, ray.ObjectRef):
try:
self._width_cache = ray.get(self._width_cache)
except RayTaskError as e:
@@ -202,10 +207,10 @@ def get_index_and_columns(df):
return len(df.index), len(df.columns)
-@ray.remote(num_returns=3)
+@ray.remote(num_returns=4)
def deploy_ray_func(call_queue, partition): # pragma: no cover
def deserialize(obj):
- if isinstance(obj, ray.ObjectID):
+ if isinstance(obj, ray.ObjectRef):
return ray.get(obj)
return obj
@@ -231,4 +236,5 @@ def deserialize(obj):
result,
len(result) if hasattr(result, "__len__") else 0,
len(result.columns) if hasattr(result, "columns") else 0,
+ get_node_ip_address(),
)
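
The bump from `num_returns=3` to `num_returns=4` matters because Ray turns a single remote call into that many `ObjectRef`s, and the extra slot now carries the worker IP. A minimal sketch of the mechanism (requires `ray` to be installed; the values are made up):

import ray

ray.init(ignore_reinit_error=True)


@ray.remote(num_returns=2)
def produce():
    # One ObjectRef is created per returned element.
    return "data", "10.0.0.1"


data_ref, ip_ref = produce.remote()
assert ray.get(ip_ref) == "10.0.0.1"
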
diff --git a/modin/engines/ray/pandas_on_ray/frame/partition_manager.py b/modin/engines/ray/pandas_on_ray/frame/partition_manager.py
index 9ca7ea7a74b..530db055971 100644
--- a/modin/engines/ray/pandas_on_ray/frame/partition_manager.py
+++ b/modin/engines/ray/pandas_on_ray/frame/partition_manager.py
@@ -29,9 +29,9 @@
def func(df, apply_func, call_queue_df=None, call_queues_other=None, *others):
if call_queue_df is not None and len(call_queue_df) > 0:
for call, kwargs in call_queue_df:
- if isinstance(call, ray.ObjectID):
+ if isinstance(call, ray.ObjectRef):
call = ray.get(call)
- if isinstance(kwargs, ray.ObjectID):
+ if isinstance(kwargs, ray.ObjectRef):
kwargs = ray.get(kwargs)
df = call(df, **kwargs)
new_others = np.empty(shape=len(others), dtype=object)
@@ -39,9 +39,9 @@ def func(df, apply_func, call_queue_df=None, call_queues_other=None, *others):
other = others[i]
if call_queue_other is not None and len(call_queue_other) > 0:
for call, kwargs in call_queue_other:
- if isinstance(call, ray.ObjectID):
+ if isinstance(call, ray.ObjectRef):
call = ray.get(call)
- if isinstance(kwargs, ray.ObjectID):
+ if isinstance(kwargs, ray.ObjectRef):
kwargs = ray.get(kwargs)
other = call(other, **kwargs)
new_others[i] = other
diff --git a/modin/engines/ray/pandas_on_ray/io.py b/modin/engines/ray/pandas_on_ray/io.py
index f3f056f9ebf..1b5cec13dc7 100644
--- a/modin/engines/ray/pandas_on_ray/io.py
+++ b/modin/engines/ray/pandas_on_ray/io.py
@@ -14,13 +14,13 @@
from modin.backends.pandas.query_compiler import PandasQueryCompiler
from modin.engines.ray.generic.io import RayIO
from modin.engines.base.io import (
- CSVReader,
- FWFReader,
- JSONReader,
- ParquetReader,
- FeatherReader,
- SQLReader,
- ExcelReader,
+ CSVDispatcher,
+ FWFDispatcher,
+ JSONDispatcher,
+ ParquetDispatcher,
+ FeatherDispatcher,
+ SQLDispatcher,
+ ExcelDispatcher,
)
from modin.backends.pandas.parsers import (
PandasCSVParser,
@@ -45,16 +45,18 @@ class PandasOnRayIO(RayIO):
query_compiler_cls=PandasQueryCompiler,
frame_cls=PandasOnRayFrame,
)
- read_csv = type("", (RayTask, PandasCSVParser, CSVReader), build_args).read
- read_fwf = type("", (RayTask, PandasFWFParser, FWFReader), build_args).read
- read_json = type("", (RayTask, PandasJSONParser, JSONReader), build_args).read
+ read_csv = type("", (RayTask, PandasCSVParser, CSVDispatcher), build_args).read
+ read_fwf = type("", (RayTask, PandasFWFParser, FWFDispatcher), build_args).read
+ read_json = type("", (RayTask, PandasJSONParser, JSONDispatcher), build_args).read
read_parquet = type(
- "", (RayTask, PandasParquetParser, ParquetReader), build_args
+ "", (RayTask, PandasParquetParser, ParquetDispatcher), build_args
).read
# Blocked on pandas-dev/pandas#12236. It is faster to default to pandas.
# read_hdf = type("", (RayTask, PandasHDFParser, HDFReader), build_args).read
read_feather = type(
- "", (RayTask, PandasFeatherParser, FeatherReader), build_args
+ "", (RayTask, PandasFeatherParser, FeatherDispatcher), build_args
+ ).read
+ read_sql = type("", (RayTask, PandasSQLParser, SQLDispatcher), build_args).read
+ read_excel = type(
+ "", (RayTask, PandasExcelParser, ExcelDispatcher), build_args
).read
- read_sql = type("", (RayTask, PandasSQLParser, SQLReader), build_args).read
- read_excel = type("", (RayTask, PandasExcelParser, ExcelReader), build_args).read
diff --git a/modin/engines/ray/utils.py b/modin/engines/ray/utils.py
index af86e63f57f..4641fa5d31b 100644
--- a/modin/engines/ray/utils.py
+++ b/modin/engines/ray/utils.py
@@ -12,7 +12,6 @@
# governing permissions and limitations under the License.
import builtins
-import threading
import os
import sys
@@ -94,12 +93,12 @@ def initialize_ray(
"""
import ray
- if threading.current_thread().name == "MainThread" or override_is_cluster:
+ if not ray.is_initialized() or override_is_cluster:
import secrets
cluster = override_is_cluster or IsRayCluster.get()
redis_address = override_redis_address or RayRedisAddress.get()
- redis_password = override_redis_password or secrets.token_hex(16)
+ redis_password = override_redis_password or secrets.token_hex(32)
if cluster:
# We only start ray in a cluster setting for the head node.
@@ -111,6 +110,17 @@ def initialize_ray(
logging_level=100,
)
else:
+ from modin.error_message import ErrorMessage
+
+ # This string is intentionally formatted this way. We want it indented in
+ # the warning message.
+ ErrorMessage.not_initialized(
+ "Ray",
+ """
+ import ray
+ ray.init()
+""",
+ )
object_store_memory = Memory.get()
plasma_directory = RayPlasmaDir.get()
if IsOutOfCore.get():
@@ -148,7 +158,6 @@ def initialize_ray(
_memory=object_store_memory,
_lru_evict=True,
)
-
_move_stdlib_ahead_of_site_packages()
ray.worker.global_worker.run_function_on_all_workers(
_move_stdlib_ahead_of_site_packages
diff --git a/modin/error_message.py b/modin/error_message.py
index 32c3b231848..59694ad9699 100644
--- a/modin/error_message.py
+++ b/modin/error_message.py
@@ -75,3 +75,11 @@ def missmatch_with_pandas(cls, operation, message):
cls.single_warning(
f"`{operation}` implementation has mismatches with pandas:\n{message}."
)
+
+ @classmethod
+ def not_initialized(cls, engine, code):
+ warnings.warn(
+ "{} execution environment not yet initialized. Initializing...\n"
+ "To remove this warning, run the following python code before doing dataframe operations:\n"
+ "{}".format(engine, code)
+ )
diff --git a/modin/experimental/backends/omnisci/query_compiler.py b/modin/experimental/backends/omnisci/query_compiler.py
index d4552a4f6e6..02da9f6c72a 100644
--- a/modin/experimental/backends/omnisci/query_compiler.py
+++ b/modin/experimental/backends/omnisci/query_compiler.py
@@ -206,10 +206,7 @@ def groupby_size(
else:
shape_hint = None
new_frame = new_frame._set_columns(list(new_frame.columns)[:-1] + ["size"])
- new_qc = self.__constructor__(new_frame, shape_hint=shape_hint)
- if groupby_args["squeeze"]:
- new_qc = new_qc.squeeze()
- return new_qc
+ return self.__constructor__(new_frame, shape_hint=shape_hint)
def groupby_sum(self, by, axis, groupby_args, map_args, **kwargs):
"""Groupby with sum aggregation.
@@ -234,10 +231,7 @@ def groupby_sum(self, by, axis, groupby_args, map_args, **kwargs):
new_frame = self._modin_frame.groupby_agg(
by, axis, "sum", groupby_args, **kwargs
)
- new_qc = self.__constructor__(new_frame)
- if groupby_args["squeeze"]:
- new_qc = new_qc.squeeze()
- return new_qc
+ return self.__constructor__(new_frame)
def groupby_count(self, by, axis, groupby_args, map_args, **kwargs):
"""Perform a groupby count.
@@ -266,40 +260,69 @@ def groupby_count(self, by, axis, groupby_args, map_args, **kwargs):
new_frame = self._modin_frame.groupby_agg(
by, axis, "count", groupby_args, **kwargs
)
- new_qc = self.__constructor__(new_frame)
- if groupby_args["squeeze"]:
- new_qc = new_qc.squeeze()
- return new_qc
+ return self.__constructor__(new_frame)
- def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False):
- """Apply aggregation functions to a grouped dataframe per-column.
+ def groupby_agg(
+ self,
+ by,
+ is_multi_by,
+ axis,
+ agg_func,
+ agg_args,
+ agg_kwargs,
+ groupby_kwargs,
+ drop=False,
+ ):
+ # TODO: handle `is_multi_by`, `agg_args`, `drop` args
+ new_frame = self._modin_frame.groupby_agg(
+ by, axis, agg_func, groupby_kwargs, **agg_kwargs
+ )
+ return self.__constructor__(new_frame)
- Parameters
- ----------
- by : DFAlgQueryCompiler
- The column to group by
- func_dict : dict of str, callable/string
- The dictionary mapping of column to function
- groupby_args : dict
- The dictionary of keyword arguments for the group by.
- agg_args : dict
- The dictionary of keyword arguments for the aggregation functions
- drop : bool
- Whether or not to drop the column from the data.
+ def count(self, **kwargs):
+ return self._agg("count", **kwargs)
- Returns
- -------
- DFAlgQueryCompiler
- The result of the per-column aggregations on the grouped dataframe.
- """
- # TODO: handle drop arg
- new_frame = self._modin_frame.groupby_agg(
- by, 0, func_dict, groupby_args, **agg_args
+ def max(self, **kwargs):
+ return self._agg("max", **kwargs)
+
+ def min(self, **kwargs):
+ return self._agg("min", **kwargs)
+
+ def sum(self, **kwargs):
+ return self._agg("sum", **kwargs)
+
+ def mean(self, **kwargs):
+ return self._agg("mean", **kwargs)
+
+ def _agg(self, agg, axis=0, level=None, **kwargs):
+ if level is not None or axis != 0:
+ return getattr(super(), agg)(axis=axis, level=level, **kwargs)
+
+ skipna = kwargs.get("skipna", True)
+ if not skipna:
+ return getattr(super(), agg)(axis=axis, level=level, **kwargs)
+
+ new_frame = self._modin_frame.agg(agg)
+ new_frame = new_frame._set_index(
+ pandas.Index.__new__(pandas.Index, data=["__reduced__"], dtype="O")
+ )
+ return self.__constructor__(new_frame, shape_hint="row")
+
+ def value_counts(self, **kwargs):
+ subset = kwargs.get("subset", None)
+ normalize = kwargs.get("normalize", False)
+ sort = kwargs.get("sort", True)
+ ascending = kwargs.get("ascending", False)
+ bins = kwargs.get("bins", False)
+ dropna = kwargs.get("dropna", True)
+
+ if bins or normalize:
+            return super().value_counts(**kwargs)
+
+ new_frame = self._modin_frame.value_counts(
+ columns=subset, dropna=dropna, sort=sort, ascending=ascending
)
- new_qc = self.__constructor__(new_frame)
- if groupby_args["squeeze"]:
- new_qc = new_qc.squeeze()
- return new_qc
+ return self.__constructor__(new_frame, shape_hint="column")
def _get_index(self):
if self._modin_frame._has_unsupported_data:
@@ -431,6 +454,9 @@ def sub(self, other, **kwargs):
def mul(self, other, **kwargs):
return self._bin_op(other, "mul", **kwargs)
+ def mod(self, other, **kwargs):
+ return self._bin_op(other, "mod", **kwargs)
+
def floordiv(self, other, **kwargs):
return self._bin_op(other, "floordiv", **kwargs)
@@ -494,9 +520,14 @@ def setitem(self, axis, key, value):
if axis == 1 or not isinstance(value, type(self)):
return super().setitem(axis=axis, key=key, value=value)
- return self._setitem(axis, key, value)
+ try:
+ result = self._setitem(axis, key, value)
+ # OmniSci engine does not yet support cases when `value` is not a subframe of `self`.
+ except NotImplementedError:
+ result = super().setitem(axis=axis, key=key, value=value)
+ return result
- _setitem = PandasQueryCompiler.setitem
+ _setitem = PandasQueryCompiler._setitem
def insert(self, loc, column, value):
"""Insert new column data.
@@ -509,6 +540,15 @@ def insert(self, loc, column, value):
Returns:
A new DFAlgQueryCompiler with new data inserted.
"""
+ if isinstance(value, type(self)):
+ value.columns = [column]
+ try:
+ result = self.insert_item(axis=1, loc=loc, value=value)
+ # OmniSci engine does not yet support cases when `value` is not a subframe of `self`.
+ except NotImplementedError:
+ result = super().insert(loc=loc, column=column, value=value)
+ return result
+
if is_list_like(value):
return super().insert(loc=loc, column=column, value=value)
@@ -596,6 +636,24 @@ def has_multiindex(self, axis=0):
assert axis == 1
return isinstance(self.columns, pandas.MultiIndex)
+ def get_index_name(self, axis=0):
+ return self.columns.name if axis else self._modin_frame.get_index_name()
+
+ def set_index_name(self, name, axis=0):
+ if axis == 0:
+ self._modin_frame = self._modin_frame.set_index_name(name)
+ else:
+ self.columns.name = name
+
+ def get_index_names(self, axis=0):
+ return self.columns.names if axis else self._modin_frame.get_index_names()
+
+ def set_index_names(self, names=None, axis=0):
+ if axis == 0:
+ self._modin_frame = self._modin_frame.set_index_names(names)
+ else:
+ self.columns.names = names
+
def free(self):
return
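
Several of the OmniSci query compiler methods above share one dispatch pattern: try the lazy OmniSci path first and fall back to the default (pandas-backed) implementation when the engine raises `NotImplementedError`. A stripped-down sketch of that pattern with invented class names:

class LazyBackend:
    def setitem_fast(self, key, value):
        if not isinstance(value, LazyBackend):
            raise NotImplementedError("only subframe assignment is supported lazily")
        return f"lazy-set {key}"


class QueryCompiler(LazyBackend):
    def setitem(self, key, value):
        try:
            return self.setitem_fast(key, value)
        except NotImplementedError:
            # Fall back to the slower but fully general default implementation.
            return f"default-set {key}"


qc = QueryCompiler()
assert qc.setitem("a", qc) == "lazy-set a"
assert qc.setitem("a", [1, 2, 3]) == "default-set a"
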
diff --git a/modin/experimental/cloud/ray-autoscaler.yml b/modin/experimental/cloud/ray-autoscaler.yml
index 94bd63f3d96..640e6277457 100644
--- a/modin/experimental/cloud/ray-autoscaler.yml
+++ b/modin/experimental/cloud/ray-autoscaler.yml
@@ -155,7 +155,7 @@ head_start_ray_commands:
echo 'export MEMORY_STORE_SIZE=$(awk "/MemFree/ { printf \"%d \\n\", \$2*1024*0.8}" /proc/meminfo)' >> ~/.bashrc
echo 'export TMPDIR="$(dirname $(mktemp tmp.XXXXXXXXXX -ut))"' >> ~/.bashrc
- ulimit -n 65536; ray start --head --num-redis-shards=1 --redis-shard-ports=6380 --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=$MEMORY_STORE_SIZE --plasma-directory=$TMPDIR
+ ulimit -n 65536; ray start --head --redis-shard-ports=6380 --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=$MEMORY_STORE_SIZE --plasma-directory=$TMPDIR
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
diff --git a/modin/experimental/cloud/rayscale.py b/modin/experimental/cloud/rayscale.py
index ba5f4e9673a..50ccd189050 100644
--- a/modin/experimental/cloud/rayscale.py
+++ b/modin/experimental/cloud/rayscale.py
@@ -21,12 +21,23 @@
import subprocess
import yaml
-from ray.autoscaler.commands import (
- create_or_update_cluster,
- teardown_cluster,
- get_head_node_ip,
- _bootstrap_config,
-)
+
+try:
+ # for ray>=1.0.1
+ from ray.autoscaler.sdk import (
+ create_or_update_cluster,
+ teardown_cluster,
+ get_head_node_ip,
+ bootstrap_config,
+ )
+except ModuleNotFoundError:
+ # for ray==1.0.0
+ from ray.autoscaler.commands import (
+ create_or_update_cluster,
+ teardown_cluster,
+ get_head_node_ip,
+ _bootstrap_config as bootstrap_config,
+ )
from .base import (
CannotSpawnCluster,
@@ -140,7 +151,7 @@ def __make_config(self):
res = self._update_conda_requirements(config["setup_commands"][0])
config["setup_commands"][0] = res
- return _bootstrap_config(config)
+ return bootstrap_config(config)
def _conda_requirements(self):
import shlex
@@ -149,13 +160,15 @@ def _conda_requirements(self):
reqs.extend(self._get_python_version())
- if not any(re.match(r"modin(\W|$)", p) for p in self.add_conda_packages):
- # user didn't define modin release;
- # use automatically detected modin release from local context
- reqs.append(self._get_modin_version())
-
if self.add_conda_packages:
+ if not any(re.match(r"modin(\W|$)", p) for p in self.add_conda_packages):
+ # user didn't define modin release;
+ # use automatically detected modin release from local context
+ reqs.append(self._get_modin_version())
+
reqs.extend(self.add_conda_packages)
+ else:
+ reqs.append(self._get_modin_version())
# this is needed, for example, for dependencies that
# looks like: "scikit-learn>=0.23"
@@ -197,15 +210,9 @@ def __do_spawn(self):
try:
create_or_update_cluster(
self.config_file,
- override_min_workers=None,
- override_max_workers=None,
no_restart=False,
restart_only=False,
- yes=True,
- override_cluster_name=None,
no_config_cache=False,
- redirect_command_output=False,
- use_login_shells=True,
)
# need to re-load the config, as create_or_update_cluster() modifies it
with open(self.config_file) as inp:
@@ -220,13 +227,7 @@ def __do_spawn(self):
def __do_destroy(self):
try:
- teardown_cluster(
- self.config_file,
- yes=True,
- workers_only=False,
- override_cluster_name=None,
- keep_min_workers=0,
- )
+ teardown_cluster(self.config_file)
self.ready = False
self.config = None
except BaseException as ex:
@@ -244,7 +245,7 @@ def _get_connection_details(self) -> ConnectionDetails:
return ConnectionDetails(
user_name=self.config["auth"]["ssh_user"],
key_file=self.config["auth"]["ssh_private_key"],
- address=get_head_node_ip(self.config_file, override_cluster_name=None),
+ address=get_head_node_ip(self.config_file),
)
def _get_main_python(self) -> str:
@@ -262,6 +263,7 @@ def wrap_cmd(self, cmd: list):
[
"bash",
"-ic",
- subprocess.list2cmdline(["conda", "run", "-n", "modin"] + cmd),
+ # workaround for https://github.com/conda/conda/issues/8385
+ subprocess.list2cmdline(["conda", "activate", "modin", "&&"] + cmd),
]
)
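
The autoscaler import block prefers the public `ray.autoscaler.sdk` API and falls back to aliasing the older private helpers, so the rest of the module calls a single set of names regardless of the installed Ray version. The same compatibility-import pattern in a generic, self-contained form (the stdlib names here are only an example):

try:
    # Newer releases expose the specific exception type.
    from json import JSONDecodeError as DecodeError
except ImportError:
    # Older releases only raised ValueError; alias it so callers catch one name.
    DecodeError = ValueError
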
diff --git a/modin/experimental/cloud/test/test_cloud.py b/modin/experimental/cloud/test/test_cloud.py
index f0f0d75fabf..15e9e878fc9 100644
--- a/modin/experimental/cloud/test/test_cloud.py
+++ b/modin/experimental/cloud/test/test_cloud.py
@@ -15,20 +15,20 @@
import pytest
from collections import namedtuple
from inspect import signature
-from modin.experimental.cloud.rayscale import RayCluster
-from modin.experimental.cloud.cluster import Provider
-from ray.autoscaler.commands import (
+from modin.experimental.cloud.rayscale import (
+ RayCluster,
create_or_update_cluster,
teardown_cluster,
get_head_node_ip,
- _bootstrap_config,
+ bootstrap_config,
)
+from modin.experimental.cloud.cluster import Provider
@pytest.fixture
def make_bootstrap_config_mock():
def bootstrap_config_mock(config, *args, **kwargs):
- signature(_bootstrap_config).bind(config, *args, **kwargs)
+ signature(bootstrap_config).bind(config, *args, **kwargs)
config["auth"]["ssh_user"] = "modin"
config["auth"]["ssh_private_key"] = "X" * 20
return config
@@ -57,21 +57,21 @@ def make_create_or_update_cluster_mock():
@pytest.fixture
def make_ray_cluster(make_bootstrap_config_mock):
- def ray_cluster():
+ def ray_cluster(conda_packages=None):
with mock.patch(
- "modin.experimental.cloud.rayscale._bootstrap_config",
+ "modin.experimental.cloud.rayscale.bootstrap_config",
make_bootstrap_config_mock,
):
ray_cluster = RayCluster(
Provider(name="aws"),
- add_conda_packages=["scikit-learn>=0.23", "modin==0.8.0"],
+ add_conda_packages=conda_packages,
)
return ray_cluster
return ray_cluster
-def test__bootstrap_config(make_ray_cluster):
+def test_bootstrap_config(make_ray_cluster):
make_ray_cluster()
@@ -107,23 +107,36 @@ def test_create_or_update_cluster(make_ray_cluster, make_create_or_update_cluste
r"""conda create --clone base --name modin --yes
conda activate modin
conda install --yes {{CONDA_PACKAGES}}
-
- pip install modin "ray==0.8.7" cloudpickle
"""
],
)
-def test_update_conda_requirements(setup_commands_source, make_ray_cluster):
+@pytest.mark.parametrize(
+ "user_packages",
+ [
+ ["scikit-learn>=0.23", "modin==0.8.0"],
+ None,
+ ],
+)
+def test_update_conda_requirements(
+ make_ray_cluster,
+ setup_commands_source,
+ user_packages,
+):
fake_version = namedtuple("FakeVersion", "major minor micro")(7, 12, 45)
with mock.patch("sys.version_info", fake_version):
- setup_commands_result = make_ray_cluster()._update_conda_requirements(
- setup_commands_source
- )
+ setup_commands_result = make_ray_cluster(
+ user_packages
+ )._update_conda_requirements(setup_commands_source)
assert f"python>={fake_version.major}.{fake_version.minor}" in setup_commands_result
assert (
f"python<={fake_version.major}.{fake_version.minor}.{fake_version.micro}"
in setup_commands_result
)
- assert "scikit-learn>=0.23" in setup_commands_result
- assert "modin==0.8.0" in setup_commands_result
assert "{{CONDA_PACKAGES}}" not in setup_commands_result
+
+ if user_packages:
+ for package in user_packages:
+ assert package in setup_commands_result
+ else:
+ assert "modin=" in setup_commands_result
diff --git a/modin/experimental/engines/omnisci_on_ray/frame/axis_partition.py b/modin/experimental/engines/omnisci_on_ray/frame/axis_partition.py
index fcdd9a8988a..e7921d7662d 100644
--- a/modin/experimental/engines/omnisci_on_ray/frame/axis_partition.py
+++ b/modin/experimental/engines/omnisci_on_ray/frame/axis_partition.py
@@ -25,7 +25,7 @@ def __init__(self, list_of_blocks):
self.list_of_blocks = [obj.oid for obj in list_of_blocks]
partition_type = OmnisciOnRayFramePartition
- instance_type = ray.ObjectID
+ instance_type = ray.ObjectRef
class OmnisciOnRayFrameColumnPartition(OmnisciOnRayFrameAxisPartition):
diff --git a/modin/experimental/engines/omnisci_on_ray/frame/calcite_builder.py b/modin/experimental/engines/omnisci_on_ray/frame/calcite_builder.py
index f7181b13ca7..e4792ff848d 100644
--- a/modin/experimental/engines/omnisci_on_ray/frame/calcite_builder.py
+++ b/modin/experimental/engines/omnisci_on_ray/frame/calcite_builder.py
@@ -44,7 +44,7 @@
)
from collections import abc
-from pandas.core.dtypes.common import _get_dtype
+from pandas.core.dtypes.common import get_dtype
class CalciteBuilder:
@@ -94,7 +94,7 @@ def gen_agg_exprs(self):
def gen_reduce_expr(self):
count_expr = self._builder._ref(self._arg.modin_frame, self._count_name)
- count_expr._dtype = _get_dtype(int)
+ count_expr._dtype = get_dtype(int)
sum_expr = self._builder._ref(self._arg.modin_frame, self._sum_name)
sum_expr._dtype = self._sum_dtype
qsum_expr = self._builder._ref(self._arg.modin_frame, self._quad_sum_name)
@@ -161,7 +161,7 @@ def gen_agg_exprs(self):
def gen_reduce_expr(self):
count_expr = self._builder._ref(self._arg.modin_frame, self._count_name)
- count_expr._dtype = _get_dtype(int)
+ count_expr._dtype = get_dtype(int)
sum_expr = self._builder._ref(self._arg.modin_frame, self._sum_name)
sum_expr._dtype = self._sum_dtype
qsum_expr = self._builder._ref(self._arg.modin_frame, self._quad_sum_name)
@@ -473,7 +473,7 @@ def _process_join(self, op):
""" Join, only equal-join supported """
cmps = [self._ref(left, c).eq(self._ref(right, c)) for c in op.on]
if len(cmps) > 1:
- condition = OpExpr("AND", cmps, _get_dtype(bool))
+ condition = OpExpr("AND", cmps, get_dtype(bool))
else:
condition = cmps[0]
node = CalciteJoinNode(
diff --git a/modin/experimental/engines/omnisci_on_ray/frame/data.py b/modin/experimental/engines/omnisci_on_ray/frame/data.py
index 5f61346bc24..dfbc857fb4c 100644
--- a/modin/experimental/engines/omnisci_on_ray/frame/data.py
+++ b/modin/experimental/engines/omnisci_on_ray/frame/data.py
@@ -16,7 +16,7 @@
from .partition_manager import OmnisciOnRayFrameManager
from pandas.core.index import ensure_index, Index, MultiIndex, RangeIndex
-from pandas.core.dtypes.common import _get_dtype, is_list_like, is_bool_dtype
+from pandas.core.dtypes.common import get_dtype, is_list_like, is_bool_dtype
from modin.error_message import ErrorMessage
import pandas as pd
@@ -143,7 +143,7 @@ def __init__(
def id_str(self):
return f"frame${self.id}"
- def _get_dtype(self, col):
+ def get_dtype(self, col):
# If we search for an index column type in a MultiIndex then we need to
# extend index column names to tuples.
if isinstance(self._dtypes, MultiIndex) and not isinstance(col, tuple):
@@ -152,8 +152,8 @@ def _get_dtype(self, col):
def ref(self, col):
if col == "__rowid__":
- return InputRefExpr(self, col, _get_dtype(int))
- return InputRefExpr(self, col, self._get_dtype(col))
+ return InputRefExpr(self, col, get_dtype(int))
+ return InputRefExpr(self, col, self.get_dtype(col))
def mask(
self,
@@ -234,8 +234,8 @@ def groupby_agg(self, by, axis, agg, groupby_args, **kwargs):
for obj in by:
if isinstance(obj, str):
by_cols.append(obj)
- elif hasattr(obj, "_query_compiler"):
- by_frames.append(obj._query_compiler._modin_frame)
+ elif hasattr(obj, "_modin_frame"):
+ by_frames.append(obj._modin_frame)
else:
raise NotImplementedError("unsupported groupby args")
by_cols = Index.__new__(Index, data=by_cols, dtype=self.columns.dtype)
@@ -318,6 +318,84 @@ def groupby_agg(self, by, axis, agg, groupby_args, **kwargs):
return new_frame
+ def agg(self, agg):
+ assert isinstance(agg, str)
+
+ agg_exprs = OrderedDict()
+ for col in self.columns:
+ agg_exprs[col] = AggregateExpr(agg, self.ref(col))
+
+ return self.__constructor__(
+ columns=self.columns,
+ dtypes=self._dtypes_for_exprs(agg_exprs),
+ op=GroupbyAggNode(self, [], agg_exprs, {"sort": False}),
+ index_cols=None,
+ force_execution_mode=self._force_execution_mode,
+ )
+
+ def value_counts(self, dropna, columns, sort, ascending):
+ by = [col for col in self.columns if columns is None or col in columns]
+
+ if not by:
+ raise ValueError("invalid columns subset is specified")
+
+ base = self
+ if dropna:
+ checks = [base.ref(col).is_not_null() for col in by]
+ condition = (
+ checks[0]
+ if len(checks) == 1
+ else OpExpr("AND", [checks], np.dtype("bool"))
+ )
+ base = self.__constructor__(
+ columns=Index.__new__(Index, data=by, dtype="O"),
+ dtypes=base._dtypes[by],
+ op=FilterNode(base, condition),
+ index_cols=None,
+ force_execution_mode=base._force_execution_mode,
+ )
+
+ agg_exprs = OrderedDict()
+ agg_exprs[""] = AggregateExpr("size", None)
+ dtypes = base._dtypes[by].tolist()
+ dtypes.append(np.dtype("int64"))
+
+ new_columns = Index.__new__(Index, data=[""], dtype="O")
+
+ res = self.__constructor__(
+ columns=new_columns,
+ dtypes=dtypes,
+ op=GroupbyAggNode(base, by, agg_exprs, {"sort": False}),
+ index_cols=by.copy(),
+ force_execution_mode=base._force_execution_mode,
+ )
+
+ if sort or ascending:
+ res = self.__constructor__(
+ columns=res.columns,
+ dtypes=res._dtypes,
+ op=SortNode(res, [""], [ascending], "last"),
+ index_cols=res._index_cols,
+ force_execution_mode=res._force_execution_mode,
+ )
+
+ # If a single column is used then it keeps its name.
+ # TODO: move it to upper levels when index renaming is in place.
+ if len(by) == 1:
+ exprs = OrderedDict()
+ exprs["__index__"] = res.ref(by[0])
+ exprs[by[0]] = res.ref("")
+
+ res = self.__constructor__(
+ columns=Index.__new__(Index, data=by, dtype="O"),
+ dtypes=self._dtypes_for_exprs(exprs),
+ op=TransformNode(res, exprs),
+ index_cols=["__index__"],
+ force_execution_mode=res._force_execution_mode,
+ )
+
+ return res
+
def fillna(
self,
value=None,
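
`value_counts` above is assembled from the lazy building blocks already present in this frame: optionally filter out rows with null keys, group by the selected columns with a single `size` aggregate, then sort by the resulting count. For reference, the same plan expressed eagerly in pandas (a sketch of the semantics, not of the lazy operator tree):

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2, None], "b": ["x", "x", "y", "y"]})

counts = (
    df.dropna(subset=["a", "b"])        # dropna=True removes null keys first
    .groupby(["a", "b"], sort=False)    # group by the requested column subset
    .size()                             # the single "size" aggregate
    .sort_values(ascending=False)       # sort with ascending=False by default
)
print(counts)
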
@@ -526,7 +604,7 @@ def _union_all(
assert index_width == 1, "unexpected index width"
aligned_index = ["__index__"]
exprs["__index__"] = frame.ref("__rowid__")
- aligned_index_dtypes = [_get_dtype(int)]
+ aligned_index_dtypes = [get_dtype(int)]
uses_rowid = True
aligned_dtypes = aligned_index_dtypes + new_dtypes
else:
@@ -703,10 +781,10 @@ def cat_codes(self):
col = self.columns[-1]
exprs = self._index_exprs()
col_expr = self.ref(col)
- code_expr = OpExpr("KEY_FOR_STRING", [col_expr], _get_dtype("int32"))
+ code_expr = OpExpr("KEY_FOR_STRING", [col_expr], get_dtype("int32"))
null_val = LiteralExpr(np.int32(-1))
exprs[col] = build_if_then_else(
- col_expr.is_null(), null_val, code_expr, _get_dtype("int32")
+ col_expr.is_null(), null_val, code_expr, get_dtype("int32")
)
return self.__constructor__(
@@ -1062,7 +1140,38 @@ def _get_index(self):
return self._index_cache
def _set_index(self, new_index):
- raise NotImplementedError("OmnisciOnRayFrame._set_index is not yet suported")
+ if not isinstance(new_index, (Index, MultiIndex)):
+ raise NotImplementedError(
+                "OmnisciOnRayFrame._set_index is not yet supported"
+ )
+
+ self._execute()
+
+ assert self._partitions.size == 1
+ obj = self._partitions[0][0].get()
+ if isinstance(obj, pd.DataFrame):
+ raise NotImplementedError(
+                "OmnisciOnRayFrame._set_index is not yet supported"
+ )
+ else:
+ assert isinstance(obj, pyarrow.Table)
+
+ at = obj
+ if self._index_cols:
+ at = at.drop(self._index_cols)
+
+ index_df = pd.DataFrame(data={}, index=new_index.copy())
+ index_df = index_df.reset_index()
+
+ index_at = pyarrow.Table.from_pandas(index_df)
+
+ for i, field in enumerate(at.schema):
+ index_at = index_at.append_column(field, at.column(i))
+
+ index_names = self._mangle_index_names(new_index.names)
+ index_at = index_at.rename_columns(index_names + list(self.columns))
+
+ return self.from_arrow(index_at, index_names, new_index)
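
The new `_set_index` path works at the Arrow level: drop the old index columns, turn the new index into its own table, append the data columns after it, and rename everything with the mangled index names. A compact pyarrow sketch of that column surgery (the data and index values are made up):

import pandas as pd
import pyarrow as pa

data = pa.table({"a": [10, 20, 30]})  # existing data columns
index_df = pd.DataFrame(index=pd.Index([7, 8, 9], name="idx")).reset_index()
result = pa.Table.from_pandas(index_df, preserve_index=False)

# Append every data column after the new index column(s), as done above.
for i, field in enumerate(data.schema):
    result = result.append_column(field, data.column(i))

print(result.column_names)  # ['idx', 'a']
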
def reset_index(self, drop):
if drop:
@@ -1119,13 +1228,76 @@ def _get_columns(self):
return super(OmnisciOnRayFrame, self)._get_columns()
columns = property(_get_columns)
- index = property(_get_index, _set_index)
+ index = property(_get_index)
def has_multiindex(self):
if self._index_cache is not None:
return isinstance(self._index_cache, MultiIndex)
return self._index_cols is not None and len(self._index_cols) > 1
+ def get_index_name(self):
+ if self._index_cols is None:
+ return None
+ if len(self._index_cols) > 1:
+ return None
+ return self._index_cols[0]
+
+ def set_index_name(self, name):
+ if self.has_multiindex():
+ ErrorMessage.single_warning("Scalar name for MultiIndex is not supported!")
+ return self
+
+ if self._index_cols is None and name is None:
+ return self
+
+ names = self._mangle_index_names([name])
+ exprs = OrderedDict()
+ if self._index_cols is None:
+ exprs[names[0]] = self.ref("__rowid__")
+ else:
+ exprs[names[0]] = self.ref(self._index_cols[0])
+
+ for col in self.columns:
+ exprs[col] = self.ref(col)
+
+ return self.__constructor__(
+ columns=self.columns,
+ dtypes=self._dtypes_for_exprs(exprs),
+ op=TransformNode(self, exprs),
+ index_cols=names,
+ uses_rowid=self._index_cols is None,
+ force_execution_mode=self._force_execution_mode,
+ )
+
+ def get_index_names(self):
+ if self.has_multiindex():
+ return self._index_cols.copy()
+ return [self.get_index_name()]
+
+ def set_index_names(self, names):
+ if not self.has_multiindex():
+ raise ValueError("Can set names for MultiIndex only")
+
+ if len(names) != len(self._index_cols):
+ raise ValueError(
+                f"Unexpected names count: expected {len(self._index_cols)}, got {len(names)}"
+ )
+
+ names = self._mangle_index_names(names)
+ exprs = OrderedDict()
+ for old, new in zip(self._index_cols, names):
+ exprs[new] = self.ref(old)
+ for col in self.columns:
+ exprs[col] = self.ref(col)
+
+ return self.__constructor__(
+ columns=self.columns,
+ dtypes=self._dtypes_for_exprs(exprs),
+ op=TransformNode(self, exprs),
+ index_cols=names,
+ force_execution_mode=self._force_execution_mode,
+ )
+
def to_pandas(self):
self._execute()
@@ -1196,10 +1368,7 @@ def from_pandas(cls, df):
orig_index_names = df.index.names
orig_df = df
- index_cols = [
- f"__index__{i}_{'__None__' if n is None else n}"
- for i, n in enumerate(df.index.names)
- ]
+ index_cols = cls._mangle_index_names(df.index.names)
df.index.names = index_cols
df = df.reset_index()
@@ -1232,7 +1401,14 @@ def from_pandas(cls, df):
)
@classmethod
- def from_arrow(cls, at):
+ def _mangle_index_names(cls, names):
+ return [
+ f"__index__{i}_{'__None__' if n is None else n}"
+ for i, n in enumerate(names)
+ ]
+
+ @classmethod
+ def from_arrow(cls, at, index_cols=None, index=None):
(
new_frame,
new_lengths,
@@ -1240,11 +1416,18 @@ def from_arrow(cls, at):
unsupported_cols,
) = cls._frame_mgr_cls.from_arrow(at, return_dims=True)
- new_columns = pd.Index(data=at.column_names, dtype="O")
- new_index = pd.RangeIndex(at.num_rows)
+ if index_cols:
+ data_cols = [col for col in at.column_names if col not in index_cols]
+ new_index = index
+ else:
+ data_cols = at.column_names
+ assert index is None
+ new_index = pd.RangeIndex(at.num_rows)
+
+ new_columns = pd.Index(data=data_cols, dtype="O")
new_dtypes = pd.Series(
[cls._arrow_type_to_dtype(col.type) for col in at.columns],
- index=new_columns,
+ index=at.column_names,
)
if len(unsupported_cols) > 0:
@@ -1260,6 +1443,7 @@ def from_arrow(cls, at):
row_lengths=new_lengths,
column_widths=new_widths,
dtypes=new_dtypes,
+ index_cols=index_cols,
has_unsupported_data=len(unsupported_cols) > 0,
)
diff --git a/modin/experimental/engines/omnisci_on_ray/frame/expr.py b/modin/experimental/engines/omnisci_on_ray/frame/expr.py
index 31089bd66c9..bb55f6ff08d 100644
--- a/modin/experimental/engines/omnisci_on_ray/frame/expr.py
+++ b/modin/experimental/engines/omnisci_on_ray/frame/expr.py
@@ -14,7 +14,7 @@
import abc
from pandas.core.dtypes.common import (
is_list_like,
- _get_dtype,
+ get_dtype,
is_float_dtype,
is_integer_dtype,
is_numeric_dtype,
@@ -30,9 +30,9 @@ def _get_common_dtype(lhs_dtype, rhs_dtype):
if lhs_dtype == rhs_dtype:
return lhs_dtype
if is_float_dtype(lhs_dtype) or is_float_dtype(rhs_dtype):
- return _get_dtype(float)
+ return get_dtype(float)
assert is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype)
- return _get_dtype(int)
+ return get_dtype(int)
_aggs_preserving_numeric_type = {"sum", "min", "max"}
@@ -44,9 +44,9 @@ def _agg_dtype(agg, dtype):
if agg in _aggs_preserving_numeric_type:
return dtype
elif agg in _aggs_with_int_result:
- return _get_dtype(int)
+ return get_dtype(int)
elif agg in _aggs_with_float_result:
- return _get_dtype(float)
+ return get_dtype(float)
else:
raise NotImplementedError(f"unsupported aggreagte {agg}")
@@ -63,6 +63,7 @@ class BaseExpr(abc.ABC):
"add": "+",
"sub": "-",
"mul": "*",
+ "mod": "MOD",
"floordiv": "/",
"truediv": "/",
"pow": "POWER",
@@ -74,19 +75,19 @@ class BaseExpr(abc.ABC):
"ne": "<>",
}
- preserve_dtype_math_ops = {"add", "sub", "mul", "floordiv", "pow"}
+ preserve_dtype_math_ops = {"add", "sub", "mul", "mod", "floordiv", "pow"}
promote_to_float_math_ops = {"truediv"}
def eq(self, other):
if not isinstance(other, BaseExpr):
other = LiteralExpr(other)
- new_expr = OpExpr("=", [self, other], _get_dtype(bool))
+ new_expr = OpExpr("=", [self, other], get_dtype(bool))
return new_expr
def le(self, other):
if not isinstance(other, BaseExpr):
other = LiteralExpr(other)
- new_expr = OpExpr("<=", [self, other], _get_dtype(bool))
+ new_expr = OpExpr("<=", [self, other], get_dtype(bool))
return new_expr
def cast(self, res_type):
@@ -94,7 +95,11 @@ def cast(self, res_type):
return new_expr
def is_null(self):
- new_expr = OpExpr("IS NULL", [self], _get_dtype(bool))
+ new_expr = OpExpr("IS NULL", [self], get_dtype(bool))
+ return new_expr
+
+ def is_not_null(self):
+ new_expr = OpExpr("IS NOT NULL", [self], get_dtype(bool))
return new_expr
def bin_op(self, other, op_name):
@@ -107,7 +112,7 @@ def bin_op(self, other, op_name):
# True division may require prior cast to float to avoid integer division
if op_name == "truediv":
if is_integer_dtype(self._dtype) and is_integer_dtype(other._dtype):
- other = other.cast(_get_dtype(float))
+ other = other.cast(get_dtype(float))
res_type = self._get_bin_op_res_type(op_name, self._dtype, other._dtype)
new_expr = OpExpr(self.binary_operations[op_name], [self, other], res_type)
# Floor division may require additional FLOOR expr.
@@ -124,6 +129,9 @@ def sub(self, other):
def mul(self, other):
return self.bin_op(other, "mul")
+ def mod(self, other):
+ return self.bin_op(other, "mod")
+
def truediv(self, other):
return self.bin_op(other, "truediv")
@@ -134,12 +142,12 @@ def pow(self, other):
return self.bin_op(other, "pow")
def floor(self):
- return OpExpr("FLOOR", [self], _get_dtype(int))
+ return OpExpr("FLOOR", [self], get_dtype(int))
def _cmp_op(self, other, op_name):
lhs_dtype_class = self._get_dtype_cmp_class(self._dtype)
rhs_dtype_class = self._get_dtype_cmp_class(other._dtype)
- res_dtype = _get_dtype(bool)
+ res_dtype = get_dtype(bool)
# In OmniSci comparison with NULL always results in NULL,
# but in Pandas it is True for 'ne' comparison and False
# for others.
@@ -172,9 +180,9 @@ def _get_bin_op_res_type(self, op_name, lhs_dtype, rhs_dtype):
if op_name in self.preserve_dtype_math_ops:
return _get_common_dtype(lhs_dtype, rhs_dtype)
elif op_name in self.promote_to_float_math_ops:
- return _get_dtype(float)
+ return get_dtype(float)
elif is_cmp_op(op_name):
- return _get_dtype(bool)
+ return get_dtype(bool)
else:
raise NotImplementedError(f"unsupported binary operation {op_name}")
@@ -226,9 +234,9 @@ def __init__(self, val):
), f"unsupported literal value {val} of type {type(val)}"
self.val = val
if val is None:
- self._dtype = _get_dtype(float)
+ self._dtype = get_dtype(float)
else:
- self._dtype = _get_dtype(type(val))
+ self._dtype = get_dtype(type(val))
def copy(self):
return LiteralExpr(self.val)
@@ -256,7 +264,8 @@ class AggregateExpr(BaseExpr):
def __init__(self, agg, op, distinct=False, dtype=None):
self.agg = agg
self.operands = [op]
- self._dtype = dtype if dtype else _agg_dtype(agg, op._dtype)
+ self._dtype = dtype if dtype else _agg_dtype(agg, op._dtype if op else None)
+ assert self._dtype is not None
self.distinct = distinct
def copy(self):
@@ -289,7 +298,7 @@ def build_row_idx_filter_expr(row_idx, row_col):
for idx in row_idx:
exprs.append(row_col.eq(idx))
- res = OpExpr("OR", exprs, _get_dtype(bool))
+ res = OpExpr("OR", exprs, get_dtype(bool))
return res
@@ -301,6 +310,6 @@ def build_if_then_else(cond, then_val, else_val, res_type):
def build_dt_expr(dt_operation, col_expr):
operation = LiteralExpr(dt_operation)
- res = OpExpr("PG_EXTRACT", [operation, col_expr], _get_dtype(int))
+ res = OpExpr("PG_EXTRACT", [operation, col_expr], get_dtype(int))
return res
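
Every expression node carries a result dtype, and the arithmetic ops reuse the promotion rule from `_get_common_dtype`: keep the dtype when both sides agree, promote to float when either side is float, otherwise stay integral. A compact sketch of that rule (assumes the pandas version targeted by this change, where `get_dtype` is importable as above):

import numpy as np
from pandas.core.dtypes.common import get_dtype, is_float_dtype, is_integer_dtype


def common_dtype(lhs, rhs):
    """Dtype promotion used for the dtype-preserving binary ops."""
    if lhs == rhs:
        return lhs
    if is_float_dtype(lhs) or is_float_dtype(rhs):
        return get_dtype(float)
    assert is_integer_dtype(lhs) and is_integer_dtype(rhs)
    return get_dtype(int)


assert common_dtype(np.dtype("int64"), np.dtype("float64")) == np.dtype("float64")
assert common_dtype(np.dtype("int32"), np.dtype("int64")) == np.dtype(int)
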
diff --git a/modin/experimental/engines/omnisci_on_ray/frame/partition.py b/modin/experimental/engines/omnisci_on_ray/frame/partition.py
index 455c1b0c86c..37e95a9b012 100644
--- a/modin/experimental/engines/omnisci_on_ray/frame/partition.py
+++ b/modin/experimental/engines/omnisci_on_ray/frame/partition.py
@@ -23,7 +23,7 @@ class OmnisciOnRayFramePartition(BaseFramePartition):
def __init__(
self, object_id=None, frame_id=None, arrow_table=None, length=None, width=None
):
- assert type(object_id) is ray.ObjectID
+ assert type(object_id) is ray.ObjectRef
self.oid = object_id
self.frame_id = frame_id
diff --git a/modin/experimental/engines/omnisci_on_ray/io.py b/modin/experimental/engines/omnisci_on_ray/io.py
index cb52bdd31b3..6fa18dabf0c 100644
--- a/modin/experimental/engines/omnisci_on_ray/io.py
+++ b/modin/experimental/engines/omnisci_on_ray/io.py
@@ -129,6 +129,7 @@ def read_csv(
low_memory=True,
memory_map=False,
float_precision=None,
+ storage_options=None,
):
items = locals().copy()
mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
@@ -185,6 +186,9 @@ def read_csv(
null_values=None,
true_values=None,
false_values=None,
+ # timestamp fields should be handled as strings if parse_dates
+            # was not passed explicitly as an array or a dict
+ timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
strings_can_be_null=None,
include_columns=None,
include_missing_columns=None,
diff --git a/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py b/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py
index d1869545fb6..aad5cef9143 100644
--- a/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py
+++ b/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py
@@ -32,6 +32,7 @@
test_data_keys,
generate_multiindex,
eval_general,
+ eval_io,
)
@@ -54,10 +55,17 @@ def run_and_compare(
**kwargs
):
def run_modin(
- fn, data, data2, force_lazy, force_arrow_execute, allow_subqueries, **kwargs
+ fn,
+ data,
+ data2,
+ force_lazy,
+ force_arrow_execute,
+ allow_subqueries,
+ constructor_kwargs,
+ **kwargs
):
- kwargs["df1"] = pd.DataFrame(data)
- kwargs["df2"] = pd.DataFrame(data2)
+ kwargs["df1"] = pd.DataFrame(data, **constructor_kwargs)
+ kwargs["df2"] = pd.DataFrame(data2, **constructor_kwargs)
kwargs["df"] = kwargs["df1"]
if force_lazy:
@@ -76,9 +84,10 @@ def run_modin(
return exp_res
+ constructor_kwargs = kwargs.pop("constructor_kwargs", {})
try:
- kwargs["df1"] = pandas.DataFrame(data)
- kwargs["df2"] = pandas.DataFrame(data2)
+ kwargs["df1"] = pandas.DataFrame(data, **constructor_kwargs)
+ kwargs["df2"] = pandas.DataFrame(data2, **constructor_kwargs)
kwargs["df"] = kwargs["df1"]
ref_res = fn(lib=pandas, **kwargs)
except Exception as e:
@@ -90,6 +99,7 @@ def run_modin(
force_lazy=force_lazy,
force_arrow_execute=force_arrow_execute,
allow_subqueries=allow_subqueries,
+ constructor_kwargs=constructor_kwargs,
**kwargs
)
_ = exp_res.index
@@ -101,11 +111,13 @@ def run_modin(
force_lazy=force_lazy,
force_arrow_execute=force_arrow_execute,
allow_subqueries=allow_subqueries,
+ constructor_kwargs=constructor_kwargs,
**kwargs
)
df_equals(ref_res, exp_res)
+@pytest.mark.usefixtures("TestReadCSVFixture")
class TestCSV:
root = os.path.abspath(__file__ + "/.." * 6) # root of modin repo
@@ -292,6 +304,30 @@ def test_float32(self):
df_equals(modin_df, pandas_df)
+ # Datetime Handling tests
+ @pytest.mark.parametrize("engine", [None, "arrow"])
+ @pytest.mark.parametrize(
+ "parse_dates",
+ [
+ True,
+ False,
+ ["col2"],
+ ],
+ )
+ def test_read_csv_datetime(
+ self,
+ engine,
+ parse_dates,
+ ):
+
+ eval_io(
+ fn_name="read_csv",
+ md_extra_kwargs={"engine": engine},
+ # read_csv kwargs
+ filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
+ parse_dates=parse_dates,
+ )
+
class TestMasks:
data = {
@@ -392,6 +428,30 @@ def applier(lib):
eval_general(pd, pandas, applier)
+ def test_set_index_name(self):
+ index = pandas.Index.__new__(pandas.Index, data=[i for i in range(24)])
+
+ pandas_df = pandas.DataFrame(self.data, index=index)
+ pandas_df.index.name = "new_name"
+ modin_df = pd.DataFrame(self.data, index=index)
+ modin_df._query_compiler.set_index_name("new_name")
+
+ df_equals(pandas_df, modin_df)
+
+ def test_set_index_names(self):
+ index = pandas.MultiIndex.from_tuples(
+ [(i, j, k) for i in range(2) for j in range(3) for k in range(4)]
+ )
+
+ pandas_df = pandas.DataFrame(self.data, index=index)
+ pandas_df.index.names = ["new_name1", "new_name2", "new_name3"]
+ modin_df = pd.DataFrame(self.data, index=index)
+ modin_df._query_compiler.set_index_names(
+ ["new_name1", "new_name2", "new_name3"]
+ )
+
+ df_equals(pandas_df, modin_df)
+
class TestFillna:
data = {"a": [1, 1, None], "b": [None, None, 2], "c": [3, None, None]}
@@ -452,8 +512,11 @@ def concat(df, **kwargs):
run_and_compare(concat, data=self.data)
- def test_setitem(self):
+ def test_setitem_lazy(self):
def applier(df, **kwargs):
+ df = df + 1
+ df["a"] = df["a"] + 1
+ df["e"] = df["a"] + 1
df["new_int8"] = np.int8(10)
df["new_int16"] = np.int16(10)
df["new_int32"] = np.int32(10)
@@ -465,16 +528,32 @@ def applier(df, **kwargs):
run_and_compare(applier, data=self.data)
- def test_insert(self):
+ def test_setitem_default(self):
+ def applier(df, lib, **kwargs):
+ df = df + 1
+ df["a"] = np.arange(3)
+ df["b"] = lib.Series(np.arange(3))
+ return df
+
+ run_and_compare(applier, data=self.data, force_lazy=False)
+
+ def test_insert_lazy(self):
def applier(df, **kwargs):
- df.insert(0, "new_int", 10)
- df.insert(0, "new_float", 5.5)
- df.insert(0, "new_list_like", np.arange(len(df)))
- df.insert(0, "qc_column", df["new_int"])
+ df = df + 1
+ df.insert(2, "new_int", 10)
+ df.insert(1, "new_float", 5.5)
+ df.insert(0, "new_a", df["a"] + 1)
+ return df
+
+ run_and_compare(applier, data=self.data)
+
+ def test_insert_default(self):
+ def applier(df, lib, **kwargs):
+ df = df + 1
+ df.insert(1, "new_range", np.arange(3))
+ df.insert(1, "new_series", lib.Series(np.arange(3)))
return df
- # setting `force_lazy=False`, because we're expecting to fallback
- # to pandas in that case, which is not supported in lazy mode
run_and_compare(applier, data=self.data, force_lazy=False)
def test_concat_many(self):
@@ -547,6 +626,17 @@ def groupby_count(df, cols, as_index, **kwargs):
run_and_compare(groupby_count, data=self.data, cols=cols, as_index=as_index)
+ @pytest.mark.xfail(
+ reason="Currently mean() passes a lambda into backend which cannot be executed on omnisci backend"
+ )
+ @pytest.mark.parametrize("cols", cols_value)
+ @pytest.mark.parametrize("as_index", bool_arg_values)
+ def test_groupby_mean(self, cols, as_index):
+ def groupby_mean(df, cols, as_index, **kwargs):
+ return df.groupby(cols, as_index=as_index).mean()
+
+ run_and_compare(groupby_mean, data=self.data, cols=cols, as_index=as_index)
+
@pytest.mark.parametrize("cols", cols_value)
@pytest.mark.parametrize("as_index", bool_arg_values)
def test_groupby_proj_sum(self, cols, as_index):
@@ -569,6 +659,26 @@ def groupby(df, **kwargs):
run_and_compare(groupby, data=self.data)
+ @pytest.mark.xfail(
+ reason="Function specified as a string should be passed into backend API, but currently it is transformed into a lambda"
+ )
+ @pytest.mark.parametrize("cols", cols_value)
+ @pytest.mark.parametrize("as_index", bool_arg_values)
+ def test_groupby_agg_mean(self, cols, as_index):
+ def groupby_mean(df, cols, as_index, **kwargs):
+ return df.groupby(cols, as_index=as_index).agg("mean")
+
+ run_and_compare(groupby_mean, data=self.data, cols=cols, as_index=as_index)
+
+ def test_groupby_lazy_multiindex(self):
+ index = generate_multiindex(len(self.data["a"]))
+
+ def groupby(df, *args, **kwargs):
+ df = df + 1
+ return df.groupby("a").agg({"b": "size"})
+
+ run_and_compare(groupby, data=self.data, constructor_kwargs={"index": index})
+
taxi_data = {
"a": [1, 1, 2, 2],
"b": [11, 21, 12, 11],
@@ -844,6 +954,46 @@ def groupby(df, **kwargs):
run_and_compare(groupby, data=self.data)
+class TestAgg:
+ data = {
+ "a": [1, 2, None, None, 1, None],
+ "b": [10, 20, None, 20, 10, None],
+ "c": [None, 200, None, 400, 500, 600],
+ "d": [11, 22, 33, 22, 33, 22],
+ }
+
+ @pytest.mark.parametrize("agg", ["max", "min", "sum", "mean"])
+ @pytest.mark.parametrize("skipna", bool_arg_values)
+ def test_simple_agg(self, agg, skipna):
+ def apply(df, agg, skipna, **kwargs):
+ return getattr(df, agg)(skipna=skipna)
+
+ run_and_compare(apply, data=self.data, agg=agg, skipna=skipna, force_lazy=False)
+
+ def test_count_agg(self):
+ def apply(df, **kwargs):
+ return df.count()
+
+ run_and_compare(apply, data=self.data, force_lazy=False)
+
+ @pytest.mark.parametrize("cols", ["a", "d"])
+ @pytest.mark.parametrize("dropna", [True, False])
+ @pytest.mark.parametrize("sort", [True])
+ @pytest.mark.parametrize("ascending", [True, False])
+ def test_value_counts(self, cols, dropna, sort, ascending):
+ def value_counts(df, cols, dropna, sort, ascending, **kwargs):
+ return df[cols].value_counts(dropna=dropna, sort=sort, ascending=ascending)
+
+ run_and_compare(
+ value_counts,
+ data=self.data,
+ cols=cols,
+ dropna=dropna,
+ sort=sort,
+ ascending=ascending,
+ )
+
+
class TestMerge:
data = {
"a": [1, 2, 3],
@@ -1148,6 +1298,39 @@ def mul2(lib, df):
run_and_compare(mul1, data=self.data)
run_and_compare(mul2, data=self.data)
+ def test_mod_cst(self):
+ def mod(lib, df):
+ return df % 2
+
+ run_and_compare(mod, data=self.data)
+
+ def test_mod_list(self):
+ def mod(lib, df):
+ return df % [2, 3, 4, 5]
+
+ run_and_compare(mod, data=self.data)
+
+ @pytest.mark.parametrize("fill_value", fill_values)
+ def test_mod_method_columns(self, fill_value):
+ def mod1(lib, df, fill_value):
+ return df["a"].mod(df["b"], fill_value=fill_value)
+
+ def mod2(lib, df, fill_value):
+ return df[["a", "c"]].mod(df[["b", "a"]], fill_value=fill_value)
+
+ run_and_compare(mod1, data=self.data, fill_value=fill_value)
+ run_and_compare(mod2, data=self.data, fill_value=fill_value)
+
+ def test_mod_columns(self):
+ def mod1(lib, df):
+ return df["a"] % df["b"]
+
+ def mod2(lib, df):
+ return df[["a", "c"]] % df[["b", "a"]]
+
+ run_and_compare(mod1, data=self.data)
+ run_and_compare(mod2, data=self.data)
+
def test_truediv_cst(self):
def truediv(lib, df):
return df / 2
diff --git a/modin/experimental/engines/pandas_on_ray/io_exp.py b/modin/experimental/engines/pandas_on_ray/io_exp.py
index c093e93708c..9b5781369ce 100644
--- a/modin/experimental/engines/pandas_on_ray/io_exp.py
+++ b/modin/experimental/engines/pandas_on_ray/io_exp.py
@@ -15,9 +15,14 @@
import pandas
import warnings
+from modin.backends.pandas.parsers import _split_result_for_readers, PandasCSVGlobParser
+from modin.backends.pandas.query_compiler import PandasQueryCompiler
from modin.engines.ray.pandas_on_ray.io import PandasOnRayIO
-from modin.backends.pandas.parsers import _split_result_for_readers
+from modin.engines.base.io import CSVGlobDispatcher
+from modin.engines.ray.pandas_on_ray.frame.data import PandasOnRayFrame
from modin.engines.ray.pandas_on_ray.frame.partition import PandasOnRayFramePartition
+from modin.engines.ray.task_wrapper import RayTask
+from modin.config import NPartitions
import ray
@@ -52,6 +57,14 @@ def _read_parquet_columns(path, columns, num_splits, kwargs): # pragma: no cove
class ExperimentalPandasOnRayIO(PandasOnRayIO):
+ build_args = dict(
+ frame_partition_cls=PandasOnRayFramePartition,
+ query_compiler_cls=PandasQueryCompiler,
+ frame_cls=PandasOnRayFrame,
+ )
+ read_csv_glob = type(
+ "", (RayTask, PandasCSVGlobParser, CSVGlobDispatcher), build_args
+ )._read
read_parquet_remote_task = _read_parquet_columns
@classmethod
@@ -117,7 +130,7 @@ def read_sql(
)
# starts the distributed alternative
cols_names, query = get_query_info(sql, con, partition_column)
- num_parts = min(cls.frame_mgr_cls._compute_num_partitions(), max_sessions)
+ num_parts = min(NPartitions.get(), max_sessions)
num_splits = min(len(cols_names), num_parts)
diff = (upper_bound - lower_bound) + 1
min_size = diff // num_parts
@@ -148,7 +161,7 @@ def read_sql(
columns,
chunksize,
),
- num_return_vals=num_splits + 1,
+ num_returns=num_splits + 1,
)
partition_ids.append(
[PandasOnRayFramePartition(obj) for obj in partition_id[:-1]]
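# Illustrative sketch (not part of the diff): the dynamic `type(...)` call in
# ExperimentalPandasOnRayIO above composes a task wrapper, a parser, and a dispatcher
# into a single class. Written out by hand it would look roughly like this; the
# explicit class name is hypothetical.
from modin.backends.pandas.parsers import PandasCSVGlobParser
from modin.backends.pandas.query_compiler import PandasQueryCompiler
from modin.engines.base.io import CSVGlobDispatcher
from modin.engines.ray.pandas_on_ray.frame.data import PandasOnRayFrame
from modin.engines.ray.pandas_on_ray.frame.partition import PandasOnRayFramePartition
from modin.engines.ray.task_wrapper import RayTask

class _PandasOnRayCSVGlobDispatcher(RayTask, PandasCSVGlobParser, CSVGlobDispatcher):
    # `build_args` supplies these as class attributes in the dynamic version.
    frame_partition_cls = PandasOnRayFramePartition
    query_compiler_cls = PandasQueryCompiler
    frame_cls = PandasOnRayFrame

# read_csv_glob = _PandasOnRayCSVGlobDispatcher._read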
diff --git a/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py b/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py
index be82e790e7b..b7cdb2eaa94 100644
--- a/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py
+++ b/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py
@@ -46,7 +46,7 @@ def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs):
for obj in deploy_ray_func_between_two_axis_partitions._remote(
args=(self.axis, func, num_splits, len(self.list_of_blocks), kwargs)
+ tuple(self.list_of_blocks + other_axis_partition.list_of_blocks),
- num_return_vals=num_splits,
+ num_returns=num_splits,
)
]
@@ -54,7 +54,7 @@ def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs):
args.extend(self.list_of_blocks)
return [
PyarrowOnRayFramePartition(obj)
- for obj in deploy_ray_axis_func._remote(args, num_return_vals=num_splits)
+ for obj in deploy_ray_axis_func._remote(args, num_returns=num_splits)
]
def shuffle(self, func, num_splits=None, **kwargs):
@@ -74,7 +74,7 @@ def shuffle(self, func, num_splits=None, **kwargs):
args.extend(self.list_of_blocks)
return [
PyarrowOnRayFramePartition(obj)
- for obj in deploy_ray_axis_func._remote(args, num_return_vals=num_splits)
+ for obj in deploy_ray_axis_func._remote(args, num_returns=num_splits)
]
diff --git a/modin/experimental/engines/pyarrow_on_ray/io.py b/modin/experimental/engines/pyarrow_on_ray/io.py
index d268fe9a36c..d39e13fc2c5 100644
--- a/modin/experimental/engines/pyarrow_on_ray/io.py
+++ b/modin/experimental/engines/pyarrow_on_ray/io.py
@@ -19,10 +19,10 @@
)
from modin.backends.pyarrow.parsers import PyarrowCSVParser
from modin.engines.ray.task_wrapper import RayTask
-from modin.engines.base.io import CSVReader
+from modin.engines.base.io import CSVDispatcher
-class PyarrowOnRayCSVReader(RayTask, PyarrowCSVParser, CSVReader):
+class PyarrowOnRayCSVDispatcher(RayTask, PyarrowCSVParser, CSVDispatcher):
frame_cls = PyarrowOnRayFrame
frame_partition_cls = PyarrowOnRayFramePartition
query_compiler_cls = PyarrowQueryCompiler
@@ -32,7 +32,7 @@ class PyarrowOnRayIO(RayIO):
frame_cls = PyarrowOnRayFrame
frame_partition_cls = PyarrowOnRayFramePartition
query_compiler_cls = PyarrowQueryCompiler
- csv_reader = PyarrowOnRayCSVReader
+ csv_reader = PyarrowOnRayCSVDispatcher
read_parquet_remote_task = None
read_hdf_remote_task = None
diff --git a/modin/experimental/pandas/__init__.py b/modin/experimental/pandas/__init__.py
index baf604d4e8b..fdc5bc6e583 100644
--- a/modin/experimental/pandas/__init__.py
+++ b/modin/experimental/pandas/__init__.py
@@ -19,7 +19,7 @@
# in the user code
from .numpy_wrap import _CAUGHT_NUMPY # noqa F401
from modin.pandas import * # noqa F401, F403
-from .io_exp import read_sql # noqa F401
+from .io_exp import read_sql, read_csv_glob # noqa F401
import warnings
diff --git a/modin/experimental/pandas/io_exp.py b/modin/experimental/pandas/io_exp.py
index ccf73460235..cabd9955d1e 100644
--- a/modin/experimental/pandas/io_exp.py
+++ b/modin/experimental/pandas/io_exp.py
@@ -12,10 +12,15 @@
# governing permissions and limitations under the License.
import inspect
+import pathlib
+from typing import Union, IO, AnyStr
+
+import pandas
from . import DataFrame
+from modin.config import IsExperimental, Engine
from modin.data_management.factories.dispatcher import EngineDispatcher
-from modin.config import IsExperimental
+from ...pandas import _update_engine
def read_sql(
@@ -63,6 +68,121 @@ def read_sql(
Returns:
Pandas Dataframe
"""
+ Engine.subscribe(_update_engine)
assert IsExperimental.get(), "This only works in experimental mode"
_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
+
+
+# CSV and table
+def _make_parser_func(sep):
+ """
+ Create a parser function from the given sep.
+
+ Parameters
+ ----------
+ sep: str
+ The default separator to use for the parser.
+
+ Returns
+ -------
+ A function object.
+ """
+
+ def parser_func(
+ filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
+ sep=sep,
+ delimiter=None,
+ header="infer",
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ prefix=None,
+ mangle_dupe_cols=True,
+ dtype=None,
+ engine=None,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skipinitialspace=False,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ keep_default_na=True,
+ na_filter=True,
+ verbose=False,
+ skip_blank_lines=True,
+ parse_dates=False,
+ infer_datetime_format=False,
+ keep_date_col=False,
+ date_parser=None,
+ dayfirst=False,
+ cache_dates=True,
+ iterator=False,
+ chunksize=None,
+ compression="infer",
+ thousands=None,
+ decimal: str = ".",
+ lineterminator=None,
+ quotechar='"',
+ quoting=0,
+ escapechar=None,
+ comment=None,
+ encoding=None,
+ dialect=None,
+ error_bad_lines=True,
+ warn_bad_lines=True,
+ skipfooter=0,
+ doublequote=True,
+ delim_whitespace=False,
+ low_memory=True,
+ memory_map=False,
+ float_precision=None,
+ ):
+ # ISSUE #2408: parse parameters shared with pandas read_csv and read_table and update them with the provided args
+ _pd_read_csv_signature = {
+ val.name for val in inspect.signature(pandas.read_csv).parameters.values()
+ }
+ _, _, _, f_locals = inspect.getargvalues(inspect.currentframe())
+ if f_locals.get("sep", sep) is False:
+ f_locals["sep"] = "\t"
+
+ kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_csv_signature}
+ return _read(**kwargs)
+
+ return parser_func
+
+
+def _read(**kwargs):
+ """
+ Read csv file from local disk.
+
+ Parameters
+ ----------
+ filepath_or_buffer:
+ The filepath of the csv file.
+ We only support local files for now.
+ kwargs: Keyword arguments in pandas.read_csv
+ """
+ from modin.data_management.factories.dispatcher import EngineDispatcher
+
+ Engine.subscribe(_update_engine)
+
+ try:
+ pd_obj = EngineDispatcher.read_csv_glob(**kwargs)
+ except AttributeError:
+ raise AttributeError("read_csv_glob() is only implemented for pandas on Ray.")
+
+ # This happens when `read_csv` returns a TextFileReader object for iterating through the data
+ if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
+ reader = pd_obj.read
+ pd_obj.read = lambda *args, **kwargs: DataFrame(
+ query_compiler=reader(*args, **kwargs)
+ )
+ return pd_obj
+
+ return DataFrame(query_compiler=pd_obj)
+
+
+read_csv_glob = _make_parser_func(sep=",")
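# Illustrative usage sketch (not part of the diff): `read_csv_glob` exposes the same
# keyword arguments as pandas.read_csv and expands glob patterns across files; the
# file paths below are hypothetical.
import modin.experimental.pandas as pd

df = pd.read_csv_glob("data/part-*.csv", nrows=1000)  # local glob, hypothetical path
# df = pd.read_csv_glob("s3://bucket/prefix/*.csv")   # S3 globs, as exercised in the tests below
print(df.shape)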
diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py
index 41f49d1a24e..b3bfd9f932b 100644
--- a/modin/experimental/pandas/test/test_io_exp.py
+++ b/modin/experimental/pandas/test/test_io_exp.py
@@ -15,10 +15,7 @@
import pytest
import modin.experimental.pandas as pd
from modin.config import Engine
-from modin.pandas.test.test_io import ( # noqa: F401
- df_equals,
- make_sql_connection,
-)
+from modin.pandas.test.utils import df_equals
@pytest.mark.skipif(
@@ -63,3 +60,56 @@ def test_from_sql_defaults(make_sql_connection): # noqa: F811
df_equals(modin_df_from_query, pandas_df)
df_equals(modin_df_from_table, pandas_df)
+
+
+@pytest.mark.usefixtures("TestReadGlobCSVFixture")
+@pytest.mark.skipif(
+ Engine.get() != "Ray", reason="Currently only support Ray engine for glob paths."
+)
+class TestCsvGlob:
+ def test_read_multiple_small_csv(self): # noqa: F811
+ pandas_df = pandas.concat([pandas.read_csv(fname) for fname in pytest.files])
+ modin_df = pd.read_csv_glob(pytest.glob_path)
+
+ # Indexes get messed up when concatting so we reset both.
+ pandas_df = pandas_df.reset_index(drop=True)
+ modin_df = modin_df.reset_index(drop=True)
+
+ df_equals(modin_df, pandas_df)
+
+ @pytest.mark.parametrize("nrows", [35, 100])
+ def test_read_multiple_csv_nrows(self, request, nrows): # noqa: F811
+ pandas_df = pandas.concat([pandas.read_csv(fname) for fname in pytest.files])
+ pandas_df = pandas_df.iloc[:nrows, :]
+
+ modin_df = pd.read_csv_glob(pytest.glob_path, nrows=nrows)
+
+ # Indexes get messed up when concatting so we reset both.
+ pandas_df = pandas_df.reset_index(drop=True)
+ modin_df = modin_df.reset_index(drop=True)
+
+ df_equals(modin_df, pandas_df)
+
+
+@pytest.mark.skipif(
+ Engine.get() != "Ray", reason="Currently only support Ray engine for glob paths."
+)
+def test_read_multiple_csv_s3():
+ modin_df = pd.read_csv_glob("S3://noaa-ghcn-pds/csv/178*.csv")
+
+ # We have to specify the columns because the column names are not identical. Since we specified the column names, we also have to skip the original column names.
+ pandas_dfs = [
+ pandas.read_csv(
+ "s3://noaa-ghcn-pds/csv/178{}.csv".format(i),
+ names=modin_df.columns,
+ skiprows=[0],
+ )
+ for i in range(10)
+ ]
+ pandas_df = pd.concat(pandas_dfs)
+
+ # Indexes get messed up when concatting so we reset both.
+ pandas_df = pandas_df.reset_index(drop=True)
+ modin_df = modin_df.reset_index(drop=True)
+
+ df_equals(modin_df, pandas_df)
diff --git a/modin/experimental/xgboost/__init__.py b/modin/experimental/xgboost/__init__.py
new file mode 100644
index 00000000000..64a667890a2
--- /dev/null
+++ b/modin/experimental/xgboost/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+from .xgboost import ModinDMatrix, train, predict
+
+__all__ = ["ModinDMatrix", "train", "predict"]
diff --git a/modin/experimental/xgboost/test/__init__.py b/modin/experimental/xgboost/test/__init__.py
new file mode 100644
index 00000000000..cae6413e559
--- /dev/null
+++ b/modin/experimental/xgboost/test/__init__.py
@@ -0,0 +1,12 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
diff --git a/modin/experimental/xgboost/test/test_default.py b/modin/experimental/xgboost/test/test_default.py
new file mode 100644
index 00000000000..0fb7f751c1b
--- /dev/null
+++ b/modin/experimental/xgboost/test/test_default.py
@@ -0,0 +1,30 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+
+import pytest
+from modin.config import Engine
+
+import modin.experimental.xgboost as xgb
+
+
+@pytest.mark.skipif(
+ Engine.get() == "Ray",
+ reason="This test doesn't make sense on Ray backend.",
+)
+@pytest.mark.parametrize("func", ["train", "predict"])
+def test_backend(func):
+ try:
+ getattr(xgb, func)({}, xgb.ModinDMatrix(None, None))
+ except ValueError:
+ pass
diff --git a/modin/experimental/xgboost/utils.py b/modin/experimental/xgboost/utils.py
new file mode 100644
index 00000000000..cea7ac5e200
--- /dev/null
+++ b/modin/experimental/xgboost/utils.py
@@ -0,0 +1,51 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import logging
+import xgboost as xgb
+
+LOGGER = logging.getLogger("[modin.xgboost]")
+
+
+class RabitContextManager:
+ def __init__(self, num_workers: int, host_ip):
+ """Start Rabit tracker. The workers connect to this tracker to share
+ their results."""
+
+ self._num_workers = num_workers
+ self.env = {"DMLC_NUM_WORKER": self._num_workers}
+ self.rabit_tracker = xgb.RabitTracker(hostIP=host_ip, nslave=self._num_workers)
+
+ def __enter__(self):
+ self.env.update(self.rabit_tracker.slave_envs())
+ self.rabit_tracker.start(self._num_workers)
+ return self.env
+
+ def __exit__(self, type, value, traceback):
+ self.rabit_tracker.join()
+
+
+class RabitContext:
+ """Context to connect a worker to a rabit tracker"""
+
+ def __init__(self, actor_ip, args):
+ self.args = args
+ self.args.append(("DMLC_TASK_ID=[modin.xgboost]:" + actor_ip).encode())
+
+ def __enter__(self):
+ xgb.rabit.init(self.args)
+ LOGGER.info("-------------- rabit started ------------------")
+
+ def __exit__(self, *args):
+ xgb.rabit.finalize()
+ LOGGER.info("-------------- rabit finished ------------------")
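# Illustrative sketch (not part of the diff): how the two helpers above nest. The
# driver starts the tracker through RabitContextManager and hands the resulting
# environment to each worker, which joins via RabitContext. `worker_ip` and
# `train_on_worker` are hypothetical placeholders.
from modin.experimental.xgboost.utils import RabitContext, RabitContextManager

def _rabit_usage_sketch(num_workers, host_ip, worker_ip, train_on_worker):
    with RabitContextManager(num_workers, host_ip) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]
        # On a worker process:
        with RabitContext(worker_ip, rabit_args):
            return train_on_worker()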
diff --git a/modin/experimental/xgboost/xgboost.py b/modin/experimental/xgboost/xgboost.py
new file mode 100644
index 00000000000..9896ea46f0b
--- /dev/null
+++ b/modin/experimental/xgboost/xgboost.py
@@ -0,0 +1,157 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+
+import logging
+from typing import Dict, Optional
+from multiprocessing import cpu_count
+
+import xgboost as xgb
+
+from modin.config import Engine
+
+LOGGER = logging.getLogger("[modin.xgboost]")
+
+
+class ModinDMatrix(xgb.DMatrix):
+ """
+ DMatrix holding on references to DataFrame.
+
+ Parameters
+ ----------
+ data : DataFrame
+ Data source of DMatrix.
+ label : DataFrame
+ Labels used for training.
+
+ Notes
+ -----
+ Currently ModinDMatrix supports only `data` and `label` parameters.
+ """
+
+ def __init__(self, data, label):
+ self.data = data
+ self.label = label
+
+ def __iter__(self):
+ yield self.data
+ yield self.label
+
+
+def train(
+ params: Dict,
+ dtrain: ModinDMatrix,
+ *args,
+ evals=(),
+ nthread: Optional[int] = cpu_count(),
+ evenly_data_distribution: Optional[bool] = True,
+ **kwargs,
+):
+ """
+ Train XGBoost model.
+
+ Parameters
+ ----------
+ params : dict
+ Booster params.
+ dtrain : ModinDMatrix
+ Data to be trained against.
+ evals: list of pairs (ModinDMatrix, string)
+ List of validation sets for which metrics will be evaluated during training.
+ Validation metrics will help us track the performance of the model.
+ nthread : int
+ Number of threads to use on each node. By default it is equal to
+ the number of threads on the master node.
+ evenly_data_distribution : boolean, default True
+ Whether to distribute partitions evenly between nodes.
+ In case `False`, minimal data transfer between nodes will be provided,
+ but the data may not be evenly distributed.
+ \\*\\*kwargs :
+ Other parameters are the same as `xgboost.train` except for
+ `evals_result`, which is returned as part of function return value
+ instead of argument.
+
+ Returns
+ -------
+ dict
+ A dictionary containing trained booster and evaluation history.
+ `history` field is the same as `eval_result` from `xgboost.train`.
+
+ .. code-block:: python
+
+ {'booster': xgboost.Booster,
+ 'history': {'train': {'logloss': ['0.48253', '0.35953']},
+ 'eval': {'logloss': ['0.480385', '0.357756']}}}
+ """
+ LOGGER.info("Training started")
+
+ if Engine.get() == "Ray":
+ from .xgboost_ray import _train
+ else:
+ raise ValueError("Current version supports only Ray engine.")
+
+ result = _train(
+ dtrain, nthread, evenly_data_distribution, params, *args, evals=evals, **kwargs
+ )
+ LOGGER.info("Training finished")
+ return result
+
+
+def predict(
+ model,
+ data: ModinDMatrix,
+ nthread: Optional[int] = cpu_count(),
+ evenly_data_distribution: Optional[bool] = True,
+ **kwargs,
+):
+ """
+ Run prediction with a trained booster.
+
+ Parameters
+ ----------
+ model : A Booster or a dictionary returned by `modin.experimental.xgboost.train`.
+ The trained model.
+ data : ModinDMatrix.
+ Input data used for prediction.
+ nthread : int
+ Number of threads to use on each node. By default it is equal to
+ the number of threads on the master node.
+ evenly_data_distribution : boolean, default True
+ Whether to distribute partitions evenly between nodes.
+ In case `False`, minimal data transfer between nodes will be provided,
+ but the data may not be evenly distributed.
+
+ Returns
+ -------
+ numpy.array
+ Array with prediction results.
+ """
+ LOGGER.info("Prediction started")
+
+ if Engine.get() == "Ray":
+ from .xgboost_ray import _predict
+ else:
+ raise ValueError("Current version supports only Ray engine.")
+
+ if isinstance(model, xgb.Booster):
+ booster = model
+ elif isinstance(model, dict):
+ booster = model["booster"]
+ else:
+ raise TypeError(
+ f"Expected types for `model` xgb.Booster or dict, but presented type is {type(model)}"
+ )
+ result = _predict(booster, data, nthread, evenly_data_distribution, **kwargs)
+ LOGGER.info("Prediction finished")
+
+ return result
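# Illustrative usage sketch (not part of the diff): training and prediction with the
# API defined above, on the Ray engine. The feature values and booster params are
# made up for illustration.
import modin.pandas as pd
import modin.experimental.xgboost as xgb

X = pd.DataFrame({"f0": [1, 2, 3, 4], "f1": [10, 20, 30, 40]})
y = pd.DataFrame({"label": [0, 1, 0, 1]})

result = xgb.train({"objective": "binary:logistic"}, xgb.ModinDMatrix(X, y), num_boost_round=10)
predictions = xgb.predict(result["booster"], xgb.ModinDMatrix(X, None))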
diff --git a/modin/experimental/xgboost/xgboost_ray.py b/modin/experimental/xgboost/xgboost_ray.py
new file mode 100644
index 00000000000..fed5d28d365
--- /dev/null
+++ b/modin/experimental/xgboost/xgboost_ray.py
@@ -0,0 +1,395 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+
+import time
+import logging
+from typing import Dict, Optional
+from multiprocessing import cpu_count
+
+import xgboost as xgb
+import ray
+from ray.services import get_node_ip_address
+import numpy as np
+import pandas
+
+from modin.distributed.dataframe.pandas import unwrap_partitions
+from .utils import RabitContext, RabitContextManager
+
+LOGGER = logging.getLogger("[modin.xgboost]")
+
+
+@ray.remote
+class ModinXGBoostActor:
+ def __init__(self, ip, nthread=cpu_count()):
+ self._evals = []
+ self._dpredict = []
+ self._ip = ip
+ self._nthreads = nthread
+
+ LOGGER.info(f"Actor <{self._ip}>, nthread = {self._nthreads} was initialized.")
+
+ def _get_dmatrix(self, X_y):
+ s = time.time()
+ X = X_y[: len(X_y) // 2]
+ y = X_y[len(X_y) // 2 :]
+
+ assert (
+ len(X) == len(y) and len(X) > 0
+ ), "X and y should have the equal length more than 0"
+
+ X = pandas.concat(X, axis=0)
+ y = pandas.concat(y, axis=0)
+ LOGGER.info(f"Concat time: {time.time() - s} s")
+
+ return xgb.DMatrix(X, y)
+
+ def set_train_data(self, *X_y, add_as_eval_method=None):
+ self._dtrain = self._get_dmatrix(X_y)
+
+ if add_as_eval_method is not None:
+ self._evals.append((self._dtrain, add_as_eval_method))
+
+ def set_predict_data(
+ self,
+ *X,
+ ):
+ for x in X:
+ self._dpredict.append(xgb.DMatrix(x, None))
+
+ def add_eval_data(self, *X_y, eval_method):
+ self._evals.append((self._get_dmatrix(X_y), eval_method))
+
+ def train(self, rabit_args, params, *args, **kwargs):
+ local_params = params.copy()
+ local_dtrain = self._dtrain
+ local_evals = self._evals
+
+ local_params["nthread"] = self._nthreads
+
+ evals_result = dict()
+
+ s = time.time()
+ with RabitContext(self._ip, rabit_args):
+ bst = xgb.train(
+ local_params,
+ local_dtrain,
+ *args,
+ evals=local_evals,
+ evals_result=evals_result,
+ **kwargs,
+ )
+ LOGGER.info(f"Local training time: {time.time() - s} s")
+ return {"booster": bst, "history": evals_result}
+
+ def predict(self, booster: xgb.Booster, *args, **kwargs):
+ local_dpredict = self._dpredict
+ booster.set_param({"nthread": self._nthreads})
+
+ s = time.time()
+ predictions = [booster.predict(X, *args, **kwargs) for X in local_dpredict]
+ LOGGER.info(f"Local prediction time: {time.time() - s} s")
+ return np.concatenate(predictions)
+
+ def exit_actor(self):
+ ray.actor.exit_actor()
+
+
+def create_actors(num_cpus=1, nthread=cpu_count()):
+ num_nodes = len(ray.nodes())
+
+ # Create remote actors
+ actors = {
+ node_info.split("node:")[-1]: ModinXGBoostActor.options(
+ num_cpus=num_cpus, resources={node_info: 1.0}
+ ).remote(node_info.split("node:")[-1], nthread=nthread)
+ for node_info in ray.cluster_resources()
+ if "node" in node_info
+ }
+
+ assert num_nodes == len(
+ actors
+ ), f"Number of nodes {num_nodes} is not equal to number of actors {len(actors)}."
+
+ return actors
+
+
+def _split_data_across_actors(
+ actors: Dict, set_func, X_parts, y_parts=None, evenly_data_distribution=True
+):
+ """
+ Split row partitions of data between actors.
+
+ Parameters
+ ----------
+ actors : dict
+ Dictionary of used actors.
+ set_func : callable
+ The function for setting data in actor.
+ X_parts : list
+ Row partitions of X data.
+ y_parts : list, default None
+ Row partitions of y data.
+ evenly_data_distribution : boolean, default True
+ Whether to distribute partitions evenly between nodes.
+ In case `False`, minimal data transfer between nodes will be provided,
+ but the data may not be evenly distributed.
+ """
+ X_parts_by_actors = _assign_row_partitions_to_actors(
+ actors, X_parts, evenly_data_distribution=evenly_data_distribution
+ )
+
+ if y_parts is not None:
+ y_parts_by_actors = _assign_row_partitions_to_actors(
+ actors,
+ y_parts,
+ X_parts_by_actors,
+ evenly_data_distribution=evenly_data_distribution,
+ )
+
+ for ip, actor in actors.items():
+ X_parts = X_parts_by_actors[ip][0]
+ if y_parts is None:
+ set_func(actor, *X_parts)
+ else:
+ y_parts = y_parts_by_actors[ip][0]
+ set_func(actor, *(X_parts + y_parts))
+
+
+def _assign_row_partitions_to_actors(
+ actors: Dict, row_partitions, data_for_aligning=None, evenly_data_distribution=True
+):
+ """
+ Assign row_partitions to actors.
+
+ Parameters
+ ----------
+ actors : dict
+ Dictionary of used actors.
+ row_partitions : list
+ Row partitions of data to assign.
+ data_for_aligning : dict, default None
+ Data whose ordering determines how row_partitions
+ should be distributed. Used to align y with X.
+ evenly_data_distribution : boolean, default True
+ Whether to distribute partitions evenly between nodes.
+ In case `False`, minimal data transfer between nodes will be provided,
+ but the data may not be evenly distributed.
+
+ Returns
+ -------
+ dict
+ Dictionary of partitions assigned to actors,
+ as {ip: (partitions, order)}.
+ """
+ row_partitions_by_actors = {ip: ([], []) for ip in actors}
+ if evenly_data_distribution:
+ _assign_partitions_evenly(
+ actors,
+ row_partitions,
+ False,
+ row_partitions_by_actors,
+ )
+ else:
+ if data_for_aligning is None:
+ actors_ips = list(actors.keys())
+ partitions_ips = [ray.get(row_part[0]) for row_part in row_partitions]
+ unique_partitions_ips = set(partitions_ips)
+ empty_actor_ips = []
+ for ip in actors_ips:
+ if ip not in unique_partitions_ips:
+ empty_actor_ips.append(ip)
+
+ # In case the portion of nodes without data is less than 10%,
+ # no data redistribution between nodes will be performed.
+ if len(empty_actor_ips) / len(actors_ips) < 0.1:
+ import warnings
+
+ for ip in empty_actor_ips:
+ actors[ip].exit_actor.remote()
+ actors.pop(ip)
+ row_partitions_by_actors.pop(ip)
+ warnings.warn(
+ f"Node {ip} isn't used as it doesn't contain any data."
+ )
+ for i, row_part in enumerate(row_partitions):
+ row_partitions_by_actors[partitions_ips[i]][0].append(row_part[1])
+ row_partitions_by_actors[partitions_ips[i]][1].append(i)
+ else:
+ _assign_partitions_evenly(
+ actors,
+ row_partitions,
+ True,
+ row_partitions_by_actors,
+ )
+ else:
+ for ip, (_, order_of_indexes) in data_for_aligning.items():
+ row_partitions_by_actors[ip][1].extend(order_of_indexes)
+ for row_idx in order_of_indexes:
+ row_partitions_by_actors[ip][0].append(row_partitions[row_idx][1])
+
+ return row_partitions_by_actors
+
+
+def _assign_partitions_evenly(
+ actors: Dict,
+ row_partitions,
+ is_partitions_have_ip,
+ row_partitions_by_actors: Dict,
+):
+ """
+ Assign row_partitions to actors evenly.
+
+ Parameters
+ ----------
+ actors : dict
+ Dictionary of used actors.
+ row_partitions : list
+ Row partitions of data to assign.
+ is_partitions_have_ip : boolean
+ Whether each value of row_partitions is (ip, partition).
+ row_partitions_by_actors : dict
+ Dictionary of partitions assigned to actors,
+ as {ip: (partitions, order)}. Output parameter.
+ """
+ num_actors = len(actors)
+ row_parts_last_idx = (
+ len(row_partitions) // num_actors
+ if len(row_partitions) % num_actors == 0
+ else len(row_partitions) // num_actors + 1
+ )
+
+ start_idx = 0
+ for ip, actor in actors.items():
+ if is_partitions_have_ip:
+ last_idx = (
+ (start_idx + row_parts_last_idx)
+ if (start_idx + row_parts_last_idx < len(row_partitions))
+ else len(row_partitions)
+ )
+ row_partitions_by_actors[ip][1].extend(list(range(start_idx, last_idx)))
+ for idx in range(start_idx, last_idx):
+ row_partitions_by_actors[ip][0].append(row_partitions[idx][1])
+ else:
+ idx_slice = (
+ slice(start_idx, start_idx + row_parts_last_idx)
+ if start_idx + row_parts_last_idx < len(row_partitions)
+ else slice(start_idx, len(row_partitions))
+ )
+ row_partitions_by_actors[ip][0].extend(row_partitions[idx_slice])
+ start_idx += row_parts_last_idx
+
+
+def _train(
+ dtrain,
+ nthread,
+ evenly_data_distribution,
+ params: Dict,
+ *args,
+ evals=(),
+ **kwargs,
+):
+ s = time.time()
+
+ X, y = dtrain
+ assert len(X) == len(y)
+
+ X_row_parts = unwrap_partitions(X, axis=0, bind_ip=not evenly_data_distribution)
+ y_row_parts = unwrap_partitions(y, axis=0, bind_ip=not evenly_data_distribution)
+ assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"
+
+ # Create remote actors
+ actors = create_actors(nthread=nthread)
+
+ add_as_eval_method = None
+ if evals:
+ for (eval_data, method) in evals[:]:
+ if eval_data is dtrain:
+ add_as_eval_method = method
+ evals.remove((eval_data, method))
+
+ for ((eval_X, eval_y), eval_method) in evals:
+ # Split data across workers
+ _split_data_across_actors(
+ actors,
+ lambda actor, *X_y: actor.add_eval_data.remote(
+ *X_y, eval_method=eval_method
+ ),
+ unwrap_partitions(eval_X, axis=0, bind_ip=not evenly_data_distribution),
+ unwrap_partitions(eval_y, axis=0, bind_ip=not evenly_data_distribution),
+ evenly_data_distribution=evenly_data_distribution,
+ )
+
+ # Split data across workers
+ _split_data_across_actors(
+ actors,
+ lambda actor, *X_y: actor.set_train_data.remote(
+ *X_y, add_as_eval_method=add_as_eval_method
+ ),
+ X_row_parts,
+ y_row_parts,
+ evenly_data_distribution=evenly_data_distribution,
+ )
+ LOGGER.info(f"Data preparation time: {time.time() - s} s")
+
+ s = time.time()
+ with RabitContextManager(len(actors), get_node_ip_address()) as env:
+ rabit_args = [("%s=%s" % item).encode() for item in env.items()]
+
+ # Train
+ fut = [
+ actor.train.remote(rabit_args, params, *args, **kwargs)
+ for _, actor in actors.items()
+ ]
+
+ # All results should be the same because of Rabit tracking. So we just
+ # return the first one.
+ result = ray.get(fut[0])
+ LOGGER.info(f"Training time: {time.time() - s} s")
+ return result
+
+
+def _predict(
+ booster,
+ data,
+ nthread: Optional[int] = cpu_count(),
+ evenly_data_distribution: Optional[bool] = True,
+ **kwargs,
+):
+ s = time.time()
+
+ X, _ = data
+ X_row_parts = unwrap_partitions(X, axis=0, bind_ip=not evenly_data_distribution)
+
+ # Create remote actors
+ actors = create_actors(nthread=nthread)
+
+ # Split data across workers
+ _split_data_across_actors(
+ actors,
+ lambda actor, *X: actor.set_predict_data.remote(*X),
+ X_row_parts,
+ evenly_data_distribution=evenly_data_distribution,
+ )
+
+ LOGGER.info(f"Data preparation time: {time.time() - s} s")
+ s = time.time()
+
+ # Predict
+ predictions = [
+ actor.predict.remote(booster, **kwargs) for _, actor in actors.items()
+ ]
+ result = ray.get(predictions)
+ LOGGER.info(f"Prediction time: {time.time() - s} s")
+
+ return np.concatenate(result)
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index 5ddc9c33c07..efc0271dc1e 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -13,7 +13,7 @@
import pandas
-__pandas_version__ = "1.1.3"
+__pandas_version__ = "1.2.1"
if pandas.__version__ != __pandas_version__:
import warnings
@@ -43,6 +43,7 @@
to_timedelta,
set_eng_float_format,
options,
+ Flags,
set_option,
NaT,
PeriodIndex,
@@ -57,6 +58,8 @@
Int16Dtype,
Int32Dtype,
Int64Dtype,
+ Float32Dtype,
+ Float64Dtype,
StringDtype,
BooleanDtype,
CategoricalDtype,
@@ -83,7 +86,6 @@
NamedAgg,
NA,
)
-import threading
import os
import multiprocessing
@@ -91,9 +93,6 @@
# Set this so that Pandas doesn't try to multithread by itself
os.environ["OMP_NUM_THREADS"] = "1"
-DEFAULT_NPARTITIONS = 4
-num_cpus = 1
-
_is_first_update = {}
dask_client = None
@@ -103,11 +102,10 @@
def _update_engine(publisher: Parameter):
- global DEFAULT_NPARTITIONS, dask_client, num_cpus
+ global dask_client
from modin.config import Backend, CpuCount
if publisher.get() == "Ray":
- import ray
from modin.engines.ray.utils import initialize_ray
# With OmniSci backend there is only a single worker per node
@@ -117,29 +115,15 @@ def _update_engine(publisher: Parameter):
os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
if _is_first_update.get("Ray", True):
initialize_ray()
- num_cpus = ray.cluster_resources()["CPU"]
- elif publisher.get() == "Dask": # pragma: no cover
- from distributed.client import get_client
-
- if threading.current_thread().name == "MainThread" and _is_first_update.get(
- "Dask", True
- ):
- import warnings
-
- warnings.warn("The Dask Engine for Modin is experimental.")
-
- try:
- dask_client = get_client()
- except ValueError:
- from distributed import Client
-
- dask_client = Client(n_workers=CpuCount.get())
+ elif publisher.get() == "Dask":
+ if _is_first_update.get("Dask", True):
+ from modin.engines.dask.utils import initialize_dask
+ initialize_dask()
elif publisher.get() == "Cloudray":
from modin.experimental.cloud import get_connection
conn = get_connection()
- remote_ray = conn.modules["ray"]
if _is_first_update.get("Cloudray", True):
@conn.teleport
@@ -161,8 +145,6 @@ def init_remote_ray(partition):
import modin.data_management.factories.dispatcher # noqa: F401
else:
get_connection().modules["modin"].set_backends("Ray", Backend.get())
-
- num_cpus = remote_ray.cluster_resources()["CPU"]
elif publisher.get() == "Cloudpython":
from modin.experimental.cloud import get_connection
@@ -172,10 +154,7 @@ def init_remote_ray(partition):
raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))
_is_first_update[publisher.get()] = False
- DEFAULT_NPARTITIONS = max(4, int(num_cpus))
-
-Engine.subscribe(_update_engine)
from .. import __version__
from .dataframe import DataFrame
@@ -332,7 +311,6 @@ def init_remote_ray(partition):
"value_counts",
"datetime",
"NamedAgg",
- "DEFAULT_NPARTITIONS",
]
del pandas, Engine, Parameter
diff --git a/modin/pandas/accessor.py b/modin/pandas/accessor.py
new file mode 100644
index 00000000000..b4895b7eabc
--- /dev/null
+++ b/modin/pandas/accessor.py
@@ -0,0 +1,111 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import pandas
+from pandas.core.arrays.sparse.dtype import SparseDtype
+
+from modin.utils import _inherit_docstrings
+
+
+class BaseSparseAccessor:
+ _validation_msg = "Can only use the '.sparse' accessor with Sparse data."
+
+ def __init__(self, data=None):
+ self._parent = data
+ self._validate(data)
+
+ def _validate(self, data):
+ raise NotImplementedError
+
+ def _default_to_pandas(self, op, *args, **kwargs):
+ return self._parent._default_to_pandas(
+ lambda parent: op(parent.sparse, *args, **kwargs)
+ )
+
+
+@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseFrameAccessor)
+class SparseFrameAccessor(BaseSparseAccessor):
+ def _validate(self, data):
+ dtypes = data.dtypes
+ if not all(isinstance(t, SparseDtype) for t in dtypes):
+ raise AttributeError(self._validation_msg)
+
+ @property
+ def density(self):
+ return self._parent._default_to_pandas(pandas.DataFrame.sparse).density
+
+ @classmethod
+ def from_spmatrix(cls, data, index=None, columns=None):
+ return cls._default_to_pandas(
+ pandas.DataFrame.sparse.from_spmatrix, data, index=index, columns=columns
+ )
+
+ def to_dense(self):
+ return self._default_to_pandas(pandas.DataFrame.sparse.to_dense)
+
+ def to_coo(self):
+ return self._default_to_pandas(pandas.DataFrame.sparse.to_coo)
+
+
+@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseAccessor)
+class SparseAccessor(BaseSparseAccessor):
+ def _validate(self, data):
+ if not isinstance(data.dtype, SparseDtype):
+ raise AttributeError(self._validation_msg)
+
+ @property
+ def density(self):
+ return self._parent._default_to_pandas(pandas.Series.sparse).density
+
+ @property
+ def fill_value(self):
+ return self._parent._default_to_pandas(pandas.Series.sparse).fill_value
+
+ @property
+ def npoints(self):
+ return self._parent._default_to_pandas(pandas.Series.sparse).npoints
+
+ @property
+ def sp_values(self):
+ return self._parent._default_to_pandas(pandas.Series.sparse).sp_values
+
+ @classmethod
+ def from_coo(cls, A, dense_index=False):
+ return cls._default_to_pandas(
+ pandas.Series.sparse.from_coo, A, dense_index=dense_index
+ )
+
+ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
+ return self._default_to_pandas(
+ pandas.Series.sparse.to_coo,
+ row_levels=row_levels,
+ column_levels=column_levels,
+ sort_labels=sort_labels,
+ )
+
+ def to_dense(self):
+ return self._default_to_pandas(pandas.Series.sparse.to_dense)
+
+
+@_inherit_docstrings(pandas.core.accessor.CachedAccessor)
+class CachedAccessor:
+ def __init__(self, name: str, accessor) -> None:
+ self._name = name
+ self._accessor = accessor
+
+ def __get__(self, obj, cls):
+ if obj is None:
+ return self._accessor
+ accessor_obj = self._accessor(obj)
+ object.__setattr__(obj, self._name, accessor_obj)
+ return accessor_obj
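# Illustrative sketch (not part of the diff): CachedAccessor is a non-data descriptor —
# the first attribute access builds the accessor and caches it on the instance, so the
# cached object shadows the descriptor afterwards. The demo classes are hypothetical;
# the assumed hookup elsewhere in the PR is along the lines of
# `DataFrame.sparse = CachedAccessor("sparse", SparseFrameAccessor)`.
from modin.pandas.accessor import CachedAccessor

class _Demo:
    pass

class _DemoAccessor:
    def __init__(self, parent):
        self.parent = parent

_Demo.acc = CachedAccessor("acc", _DemoAccessor)

obj = _Demo()
first = obj.acc          # __get__ builds _DemoAccessor(obj) and caches it on obj
assert obj.acc is first  # later lookups hit the cached instance attribute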
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index 74872889c1c..a40015023f9 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -41,9 +41,10 @@
from pandas.util._validators import validate_bool_kwarg, validate_percentile
from pandas._libs.lib import no_default
from pandas._typing import (
+ IndexKeyFunc,
+ StorageOptions,
TimedeltaConvertibleTypes,
TimestampConvertibleTypes,
- IndexKeyFunc,
)
import re
from typing import Optional, Union
@@ -207,6 +208,7 @@ def _validate_other(
numeric_or_time_only=False,
numeric_or_object_only=False,
comparison_dtypes_only=False,
+ compare_index=False,
):
"""
Help to check validity of other in inter-df operations.
@@ -260,6 +262,9 @@ def _validate_other(
else len(self._query_compiler.columns)
)
]
+ if compare_index:
+ if not self.index.equals(other.index):
+ raise TypeError("Cannot perform operation with non-equal index")
# Do dtype checking.
if numeric_only:
if not all(
@@ -334,6 +339,18 @@ def _binary_op(self, op, other, **kwargs):
getattr(getattr(pandas, type(self).__name__), op), other, **kwargs
)
other = self._validate_other(other, axis, numeric_or_object_only=True)
+ exclude_list = [
+ "__add__",
+ "__radd__",
+ "__and__",
+ "__rand__",
+ "__or__",
+ "__ror__",
+ "__xor__",
+ "__rxor__",
+ ]
+ if op in exclude_list:
+ kwargs.pop("axis")
new_query_compiler = getattr(self._query_compiler, op)(other, **kwargs)
return self._create_or_update_from_compiler(new_query_compiler)
@@ -488,17 +505,11 @@ def add(self, other, axis="columns", level=None, fill_value=None):
)
def aggregate(self, func=None, axis=0, *args, **kwargs):
- warnings.warn(
- "Modin index may not match pandas index due to pandas issue pandas-dev/pandas#36189."
- )
axis = self._get_axis_number(axis)
result = None
if axis == 0:
- try:
- result = self._aggregate(func, _axis=axis, *args, **kwargs)
- except TypeError:
- pass
+ result = self._aggregate(func, _axis=axis, *args, **kwargs)
if result is None:
kwargs.pop("is_transform", None)
return self.apply(func, axis=axis, args=args, **kwargs)
@@ -506,22 +517,22 @@ def aggregate(self, func=None, axis=0, *args, **kwargs):
agg = aggregate
- def _aggregate(self, arg, *args, **kwargs):
+ def _aggregate(self, func, *args, **kwargs):
_axis = kwargs.pop("_axis", 0)
kwargs.pop("_level", None)
- if isinstance(arg, str):
+ if isinstance(func, str):
kwargs.pop("is_transform", None)
- return self._string_function(arg, *args, **kwargs)
+ return self._string_function(func, *args, **kwargs)
# Dictionaries have complex behavior because they can be renamed here.
- elif isinstance(arg, dict):
- return self._default_to_pandas("agg", arg, *args, **kwargs)
- elif is_list_like(arg) or callable(arg):
+ elif func is None or isinstance(func, dict):
+ return self._default_to_pandas("agg", func, *args, **kwargs)
+ elif is_list_like(func) or callable(func):
kwargs.pop("is_transform", None)
- return self.apply(arg, axis=_axis, args=args, **kwargs)
+ return self.apply(func, axis=_axis, args=args, **kwargs)
else:
- raise TypeError("type {} is not callable".format(type(arg)))
+ raise TypeError("type {} is not callable".format(type(func)))
def _string_function(self, func, *args, **kwargs):
assert isinstance(func, str)
@@ -686,9 +697,6 @@ def apply(
args=(),
**kwds,
):
- warnings.warn(
- "Modin index may not match pandas index due to pandas issue pandas-dev/pandas#36189."
- )
axis = self._get_axis_number(axis)
ErrorMessage.non_verified_udf()
if isinstance(func, str):
@@ -786,19 +794,26 @@ def at(self, axis=None):
def at_time(self, time, asof=False, axis=None):
axis = self._get_axis_number(axis)
- if axis == 0:
- return self.iloc[self.index.indexer_at_time(time, asof=asof)]
- return self.iloc[:, self.columns.indexer_at_time(time, asof=asof)]
+ idx = self.index if axis == 0 else self.columns
+ indexer = pandas.Series(index=idx).at_time(time, asof=asof).index
+ return self.loc[indexer] if axis == 0 else self.loc[:, indexer]
def between_time(
self, start_time, end_time, include_start=True, include_end=True, axis=None
):
axis = self._get_axis_number(axis)
idx = self.index if axis == 0 else self.columns
- indexer = idx.indexer_between_time(
- start_time, end_time, include_start=include_start, include_end=include_end
+ indexer = (
+ pandas.Series(index=idx)
+ .between_time(
+ start_time,
+ end_time,
+ include_start=include_start,
+ include_end=include_end,
+ )
+ .index
)
- return self.iloc[indexer] if axis == 0 else self.iloc[:, indexer]
+ return self.loc[indexer] if axis == 0 else self.loc[:, indexer]
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
return self.fillna(
@@ -1296,6 +1311,7 @@ def convert_dtypes(
convert_string: bool = True,
convert_integer: bool = True,
convert_boolean: bool = True,
+ convert_floating: bool = True,
):
return self._default_to_pandas(
"convert_dtypes",
@@ -1334,10 +1350,17 @@ def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
query_compiler=self._query_compiler.apply("kurt", axis, **func_kwargs)
)
- if numeric_only:
+ if numeric_only is not None and not numeric_only:
self._validate_dtypes(numeric_only=True)
+
+ data = (
+ self._get_numeric_data(axis)
+ if numeric_only is None or numeric_only
+ else self
+ )
+
return self._reduce_dimension(
- self._query_compiler.kurt(
+ data._query_compiler.kurt(
axis=axis,
skipna=skipna,
level=level,
@@ -1400,6 +1423,15 @@ def mask(
)
def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+ if level is not None:
+ return self._default_to_pandas(
+ "max",
+ axis=axis,
+ skipna=skipna,
+ level=level,
+ numeric_only=numeric_only,
+ **kwargs,
+ )
axis = self._get_axis_number(axis)
data = self._validate_dtypes_min_max(axis, numeric_only)
return data._reduce_dimension(
@@ -1412,19 +1444,82 @@ def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
)
)
- def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+ def _stat_operation(
+ self,
+ op_name: str,
+ axis: Union[int, str],
+ skipna: bool,
+ level: Optional[Union[int, str]],
+ numeric_only: Optional[bool] = None,
+ **kwargs,
+ ):
+ """
+ Do common statistical reduce operations over the frame.
+
+ Parameters
+ ----------
+ op_name: str,
+ Name of method to apply.
+ axis: int or axis name,
+ Axis to apply method on.
+ skipna: bool,
+ Exclude NA/null values when computing the result.
+ level: int or level name,
+ If the specified `axis` is a MultiIndex (hierarchical), apply the method
+ along a particular level, collapsing into a Series.
+ numeric_only: bool
+ Include only float, int, boolean columns. If None, will attempt
+ to use everything, then use only numeric data.
+
+ Returns
+ -------
+ In case of Series: scalar or Series (if level specified)
+ In case of DataFrame: Series or DataFrame (if level specified)
+
+ """
axis = self._get_axis_number(axis)
- data = self._validate_dtypes_sum_prod_mean(
- axis, numeric_only, ignore_axis=False
- )
- return data._reduce_dimension(
- data._query_compiler.mean(
+ if level is not None:
+ return self._default_to_pandas(
+ op_name,
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
**kwargs,
)
+ # If `numeric_only` is None, precheck whether the frame contains non-numeric
+ # columns. If it doesn't, we can pass `numeric_only=False` to the backend and
+ # make its work easier in that case, rather than handling the more complicated
+ # `numeric_only=None` parameter
+ if not numeric_only:
+ try:
+ self._validate_dtypes(numeric_only=True)
+ except TypeError:
+ if numeric_only is not None:
+ raise
+ else:
+ numeric_only = False
+
+ data = (
+ self._get_numeric_data(axis)
+ if numeric_only is None or numeric_only
+ else self
+ )
+ result_qc = getattr(data._query_compiler, op_name)(
+ axis=axis,
+ skipna=skipna,
+ level=level,
+ numeric_only=numeric_only,
+ **kwargs,
+ )
+ return self._reduce_dimension(result_qc)
+
+ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+ return self._stat_operation("mean", axis, skipna, level, numeric_only, **kwargs)
+
+ def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+ return self._stat_operation(
+ "median", axis, skipna, level, numeric_only, **kwargs
)
def memory_usage(self, index=True, deep=False):
@@ -1433,6 +1528,15 @@ def memory_usage(self, index=True, deep=False):
)
def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+ if level is not None:
+ return self._default_to_pandas(
+ "min",
+ axis=axis,
+ skipna=skipna,
+ level=level,
+ numeric_only=numeric_only,
+ **kwargs,
+ )
axis = self._get_axis_number(axis)
data = self._validate_dtypes_min_max(axis, numeric_only)
return data._reduce_dimension(
@@ -1537,7 +1641,6 @@ def check_dtype(t):
# check that all qs are between 0 and 1
validate_percentile(q)
axis = self._get_axis_number(axis)
-
if isinstance(q, (pandas.Series, np.ndarray, pandas.Index, list)):
return self.__constructor__(
query_compiler=self._query_compiler.quantile_for_list_of_values(
@@ -1583,58 +1686,28 @@ def rank(
def reindex(
self,
- labels=None,
index=None,
columns=None,
- axis=None,
- method=None,
copy=True,
- level=None,
- fill_value=np.nan,
- limit=None,
- tolerance=None,
+ **kwargs,
):
- axis = self._get_axis_number(axis)
if (
- level is not None
- or (
- (columns is not None or axis == 1)
- and self._query_compiler.has_multiindex(axis=1)
- )
- or (
- (index is not None or axis == 0)
- and self._query_compiler.has_multiindex()
- )
+ kwargs.get("level") is not None
+ or (index is not None and self._query_compiler.has_multiindex())
+ or (columns is not None and self._query_compiler.has_multiindex(axis=1))
):
- return self._default_to_pandas(
- "reindex",
- labels=labels,
- index=index,
- columns=columns,
- axis=axis,
- method=method,
- copy=copy,
- level=level,
- fill_value=fill_value,
- limit=limit,
- tolerance=tolerance,
- )
- if axis == 0 and labels is not None:
- index = labels
- elif labels is not None:
- columns = labels
+ if index is not None:
+ kwargs["index"] = index
+ if columns is not None:
+ kwargs["columns"] = columns
+ return self._default_to_pandas("reindex", copy=copy, **kwargs)
new_query_compiler = None
if index is not None:
if not isinstance(index, pandas.Index):
index = pandas.Index(index)
if not index.equals(self.index):
new_query_compiler = self._query_compiler.reindex(
- axis=0,
- labels=index,
- method=method,
- fill_value=fill_value,
- limit=limit,
- tolerance=tolerance,
+ axis=0, labels=index, **kwargs
)
if new_query_compiler is None:
new_query_compiler = self._query_compiler
@@ -1644,12 +1717,7 @@ def reindex(
columns = pandas.Index(columns)
if not columns.equals(self.columns):
final_query_compiler = new_query_compiler.reindex(
- axis=1,
- labels=columns,
- method=method,
- fill_value=fill_value,
- limit=limit,
- tolerance=tolerance,
+ axis=1, labels=columns, **kwargs
)
if final_query_compiler is None:
final_query_compiler = new_query_compiler
@@ -1970,6 +2038,13 @@ def sample(
query_compiler = self._query_compiler.getitem_row_array(samples)
return self.__constructor__(query_compiler=query_compiler)
+ def sem(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
+ return self._stat_operation(
+ "sem", axis, skipna, level, numeric_only, ddof=ddof, **kwargs
+ )
+
def set_axis(self, labels, axis=0, inplace=False):
if is_scalar(labels):
warnings.warn(
@@ -1988,11 +2063,36 @@ def set_axis(self, labels, axis=0, inplace=False):
obj.set_axis(labels, axis=axis, inplace=True)
return obj
- def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ def set_flags(
+ self, *, copy: bool = False, allows_duplicate_labels: Optional[bool] = None
+ ):
+ return self._default_to_pandas(
+ pandas.DataFrame.set_flags,
+ copy=copy,
+ allows_duplicate_labels=allows_duplicate_labels,
+ )
+
+ @property
+ def flags(self):
+ def flags(df):
+ return df.flags
+
+ return self._default_to_pandas(flags)
+
+ def shift(self, periods=1, freq=None, axis=0, fill_value=no_default):
if periods == 0:
# Check obvious case first
return self.copy()
+ if fill_value is no_default:
+ nan_values = dict()
+ for name, dtype in dict(self.dtypes).items():
+ nan_values[name] = (
+ pandas.NaT if is_datetime_or_timedelta_dtype(dtype) else pandas.NA
+ )
+
+ fill_value = nan_values
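+ # Illustrative sketch (assumed dtypes, not from the original change): for a
+ # frame with columns {"a": int64, "b": datetime64[ns]}, the mapping built
+ # above is {"a": pandas.NA, "b": pandas.NaT}, so each shifted column is
+ # filled with a missing value appropriate for its dtype.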
+
empty_frame = False
if axis == "index" or axis == 0:
if abs(periods) >= len(self.index):
@@ -2043,6 +2143,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
new_frame.columns = self.columns.copy()
return new_frame
else:
+ if not isinstance(self, DataFrame):
+ raise ValueError(
+ f"No axis named {axis} for object type {type(self)}"
+ )
res_columns = self.columns
from .general import concat
@@ -2059,6 +2163,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
else:
return self.tshift(periods, freq)
+ def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+ return self._stat_operation("skew", axis, skipna, level, numeric_only, **kwargs)
+
def sort_index(
self,
axis=0,
@@ -2071,6 +2178,11 @@ def sort_index(
ignore_index: bool = False,
key: Optional[IndexKeyFunc] = None,
):
+ # pandas throws this exception. See pandas issue #39434
+ if ascending is None:
+ raise ValueError(
+ "the `axis` parameter is not supported in the pandas implementation of argsort()"
+ )
axis = self._get_axis_number(axis)
inplace = validate_bool_kwarg(inplace, "inplace")
new_query_compiler = self._query_compiler.sort_index(
@@ -2119,6 +2231,13 @@ def sort_values(
)
return self._create_or_update_from_compiler(result, inplace)
+ def std(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
+ return self._stat_operation(
+ "std", axis, skipna, level, numeric_only, ddof=ddof, **kwargs
+ )
+
def sub(self, other, axis="columns", level=None, fill_value=None):
return self._binary_op(
"sub", other, axis=axis, level=level, fill_value=fill_value
@@ -2176,6 +2295,7 @@ def to_csv(
escapechar=None,
decimal=".",
errors: str = "strict",
+ storage_options: StorageOptions = None,
): # pragma: no cover
kwargs = {
@@ -2199,6 +2319,7 @@ def to_csv(
"escapechar": escapechar,
"decimal": decimal,
"errors": errors,
+ "storage_options": storage_options,
}
return self._default_to_pandas("to_csv", **kwargs)
@@ -2223,25 +2344,27 @@ def to_excel(
inf_rep="inf",
verbose=True,
freeze_panes=None,
+ storage_options: StorageOptions = None,
): # pragma: no cover
return self._default_to_pandas(
"to_excel",
excel_writer,
- sheet_name,
- na_rep,
- float_format,
- columns,
- header,
- index,
- index_label,
- startrow,
- startcol,
- engine,
- merge_cells,
- encoding,
- inf_rep,
- verbose,
- freeze_panes,
+ sheet_name=sheet_name,
+ na_rep=na_rep,
+ float_format=float_format,
+ columns=columns,
+ header=header,
+ index=index,
+ index_label=index_label,
+ startrow=startrow,
+ startcol=startcol,
+ engine=engine,
+ merge_cells=merge_cells,
+ encoding=encoding,
+ inf_rep=inf_rep,
+ verbose=verbose,
+ freeze_panes=freeze_panes,
+ storage_options=storage_options,
)
def to_hdf(self, path_or_buf, key, format="table", **kwargs): # pragma: no cover
@@ -2262,6 +2385,7 @@ def to_json(
compression="infer",
index=True,
indent=None,
+ storage_options: StorageOptions = None,
): # pragma: no cover
return self._default_to_pandas(
"to_json",
@@ -2276,6 +2400,7 @@ def to_json(
compression=compression,
index=index,
indent=indent,
+ storage_options=storage_options,
)
def to_latex(
@@ -2301,6 +2426,7 @@ def to_latex(
multirow=None,
caption=None,
label=None,
+ position=None,
): # pragma: no cover
return self._default_to_pandas(
"to_latex",
@@ -2327,9 +2453,21 @@ def to_latex(
label=None,
)
- def to_markdown(self, buf=None, mode=None, index: bool = True, **kwargs):
+ def to_markdown(
+ self,
+ buf=None,
+ mode: str = "wt",
+ index: bool = True,
+ storage_options: StorageOptions = None,
+ **kwargs,
+ ):
return self._default_to_pandas(
- "to_markdown", buf=buf, mode=mode, index=index, **kwargs
+ "to_markdown",
+ buf=buf,
+ mode=mode,
+ index=index,
+ storage_options=storage_options,
+ **kwargs,
)
def to_numpy(self, dtype=None, copy=False, na_value=no_default):
@@ -2344,10 +2482,18 @@ def to_period(self, freq=None, axis=0, copy=True): # pragma: no cover
return self._default_to_pandas("to_period", freq=freq, axis=axis, copy=copy)
def to_pickle(
- self, path, compression="infer", protocol=pkl.HIGHEST_PROTOCOL
+ self,
+ path,
+ compression="infer",
+ protocol=pkl.HIGHEST_PROTOCOL,
+ storage_options: StorageOptions = None,
): # pragma: no cover
return self._default_to_pandas(
- "to_pickle", path, compression=compression, protocol=protocol
+ "to_pickle",
+ path,
+ compression=compression,
+ protocol=protocol,
+ storage_options=storage_options,
)
def to_string(
@@ -2464,7 +2610,10 @@ def tshift(self, periods=1, freq=None, axis=0):
def transform(self, func, axis=0, *args, **kwargs):
kwargs["is_transform"] = True
- result = self.agg(func, axis=axis, *args, **kwargs)
+ try:
+ result = self.agg(func, axis=axis, *args, **kwargs)
+ except Exception:
+ raise ValueError("Transform function failed")
try:
assert len(result) == len(self)
except Exception:
@@ -2500,6 +2649,13 @@ def tz_localize(
)
return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)
+ def var(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
+ return self._stat_operation(
+ "var", axis, skipna, level, numeric_only, ddof=ddof, **kwargs
+ )
+
def __abs__(self):
return self.abs()
@@ -2552,7 +2708,21 @@ def __getitem__(self, key):
else:
return self._getitem(key)
- def _getitem_slice(self, key):
+ def _setitem_slice(self, key: slice, value):
+ """
+ Set rows specified by 'key' slice with 'value'.
+
+ Parameters
+ ----------
+ key: location or index based slice,
+ Key that points rows to modify.
+ value: any,
+ Value to assing to the rows.
+ """
+ indexer = convert_to_index_sliceable(pandas.DataFrame(index=self.index), key)
+ self.iloc[indexer] = value
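+ # Illustrative note (assumed example, not part of the original change):
+ # `df[1:3] = 0` routes through here; the slice is converted to a positional
+ # indexer against the row index and the selected rows are written via `iloc`.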
+
+ def _getitem_slice(self, key: slice):
if key.start is None and key.stop is None:
return self.copy()
return self.iloc[key]
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 4566eb47284..32ba5b32960 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -30,30 +30,31 @@
is_list_like,
is_numeric_dtype,
)
-from pandas.core.indexes.api import ensure_index_from_sequences
from pandas.util._validators import validate_bool_kwarg
from pandas.io.formats.printing import pprint_thing
from pandas._libs.lib import no_default
-from pandas._typing import Label
+from pandas._typing import Label, StorageOptions
import itertools
import functools
import numpy as np
import sys
-from typing import Optional, Sequence, Tuple, Union, Mapping
+from typing import IO, Optional, Sequence, Tuple, Union, Mapping, Iterator
import warnings
from modin.error_message import ErrorMessage
from modin.utils import _inherit_docstrings, to_pandas, hashable
-from modin.config import IsExperimental
+from modin.config import Engine, IsExperimental
from .utils import (
from_pandas,
from_non_pandas,
)
+from . import _update_engine
from .iterator import PartitionIterator
from .series import Series
from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
from .groupby import DataFrameGroupBy
+from .accessor import CachedAccessor, SparseFrameAccessor
@_inherit_docstrings(pandas.DataFrame, excluded=[pandas.DataFrame.__init__])
@@ -75,7 +76,7 @@ def __init__(
data: NumPy ndarray (structured or homogeneous) or dict:
Dict can contain Series, arrays, constants, or list-like
objects.
- index: pandas.Index, list, ObjectID
+ index: pandas.Index, list
The row index for this DataFrame.
columns: pandas.Index
The column names for this DataFrame, in pandas Index object.
@@ -86,6 +87,7 @@ def __init__(
query_compiler: query_compiler
A query compiler object to manage distributed computation.
"""
+ Engine.subscribe(_update_engine)
if isinstance(data, (DataFrame, Series)):
self._query_compiler = data._query_compiler.copy()
if index is not None and any(i not in data.index for i in index):
@@ -278,7 +280,7 @@ def add_prefix(self, prefix):
def add_suffix(self, suffix):
return DataFrame(query_compiler=self._query_compiler.add_suffix(suffix))
- def applymap(self, func):
+ def applymap(self, func, na_action: Optional[str] = None):
if not callable(func):
raise ValueError("'{0}' object is not callable".format(type(func)))
ErrorMessage.non_verified_udf()
@@ -364,19 +366,16 @@ def groupby(
if callable(by):
by = self.index.map(by)
- elif isinstance(by, str):
+ elif hashable(by) and not isinstance(by, pandas.Grouper):
drop = by in self.columns
idx_name = by
- if (
- self._query_compiler.has_multiindex(axis=axis)
- and by in self.axes[axis].names
- or hasattr(self.axes[axis], "name")
- and self.axes[axis].name == by
- ):
+ if self._query_compiler.has_multiindex(
+ axis=axis
+ ) and by in self._query_compiler.get_index_names(axis):
# In this case we pass the string value of the name through to the
# partitions. This is more efficient than broadcasting the values.
pass
- else:
+ elif level is None:
by = self.__getitem__(by)._query_compiler
elif isinstance(by, Series):
drop = by._parent is self
@@ -384,29 +383,41 @@ def groupby(
by = by._query_compiler
elif is_list_like(by):
# fastpath for multi column groupby
- if (
- not isinstance(by, Series)
- and axis == 0
- and all(
- (
- (isinstance(o, str) and (o in self))
- or (isinstance(o, Series) and (o._parent is self))
- )
- for o in by
+ if axis == 0 and all(
+ (
+ (hashable(o) and (o in self))
+ or isinstance(o, Series)
+ or (is_list_like(o) and len(o) == len(self.axes[axis]))
)
+ for o in by
):
- # We can just revert Series back to names because the parent is
- # this dataframe:
- by = [o.name if isinstance(o, Series) else o for o in by]
- by = self.__getitem__(by)._query_compiler
+ # We want to split 'by's into those that belongs to the self (internal_by)
+ # and those that doesn't (external_by)
+ internal_by, external_by = [], []
+
+ for current_by in by:
+ if hashable(current_by):
+ internal_by.append(current_by)
+ elif isinstance(current_by, Series):
+ if current_by._parent is self:
+ internal_by.append(current_by.name)
+ else:
+ external_by.append(current_by._query_compiler)
+ else:
+ external_by.append(current_by)
+
+ by = internal_by + external_by
+
+ if len(external_by) == 0:
+ by = self[internal_by]._query_compiler
+
drop = True
else:
mismatch = len(by) != len(self.axes[axis])
if mismatch and all(
- isinstance(obj, str)
+ hashable(obj)
and (
- obj in self
- or (hasattr(self.index, "names") and obj in self.index.names)
+ obj in self or obj in self._query_compiler.get_index_names(axis)
)
for obj in by
):
@@ -414,7 +425,7 @@ def groupby(
# we default to pandas in this case.
pass
elif mismatch and any(
- isinstance(obj, str) and obj not in self.columns for obj in by
+ hashable(obj) and obj not in self.columns for obj in by
):
names = [o.name if isinstance(o, Series) else o for o in by]
raise KeyError(next(x for x in names if x not in self))
@@ -554,12 +565,16 @@ def compare(
keep_shape: bool = False,
keep_equal: bool = False,
) -> "DataFrame":
- return self._default_to_pandas(
- pandas.DataFrame.compare,
- other=other,
- align_axis=align_axis,
- keep_shape=keep_shape,
- keep_equal=keep_equal,
+ if not isinstance(other, DataFrame):
+ raise TypeError(f"Cannot compare DataFrame to {type(other)}")
+ other = self._validate_other(other, 0, compare_index=True)
+ return self.__constructor__(
+ query_compiler=self._query_compiler.compare(
+ other,
+ align_axis=align_axis,
+ keep_shape=keep_shape,
+ keep_equal=keep_equal,
+ )
)
def corr(self, method="pearson", min_periods=1):
@@ -774,7 +789,13 @@ def hist(
)
def info(
- self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
+ self,
+ verbose: Optional[bool] = None,
+ buf: Optional[IO[str]] = None,
+ max_cols: Optional[int] = None,
+ memory_usage: Optional[Union[bool, str]] = None,
+ show_counts: Optional[bool] = None,
+ null_counts: Optional[bool] = None,
):
def put_str(src, output_len=None, spaces=2):
src = str(src)
@@ -907,21 +928,18 @@ def insert(self, loc, column, value, allow_duplicates=False):
if isinstance(value, (DataFrame, pandas.DataFrame)):
if len(value.columns) != 1:
raise ValueError("Wrong number of items passed 2, placement implies 1")
- value = value.iloc[:, 0]
-
- if isinstance(value, Series):
- # TODO: Remove broadcast of Series
- value = value._to_pandas()
+ value = value.squeeze(axis=1)
if not self._query_compiler.lazy_execution and len(self.index) == 0:
- try:
- value = pandas.Series(value)
- except (TypeError, ValueError, IndexError):
- raise ValueError(
- "Cannot insert into a DataFrame with no defined index "
- "and a value that cannot be converted to a "
- "Series"
- )
+ if not hasattr(value, "index"):
+ try:
+ value = pandas.Series(value)
+ except (TypeError, ValueError, IndexError):
+ raise ValueError(
+ "Cannot insert into a DataFrame with no defined index "
+ "and a value that cannot be converted to a "
+ "Series"
+ )
new_index = value.index.copy()
new_columns = self.columns.insert(loc, column)
new_query_compiler = DataFrame(
@@ -934,7 +952,7 @@ def insert(self, loc, column, value, allow_duplicates=False):
else:
if (
is_list_like(value)
- and not isinstance(value, pandas.Series)
+ and not isinstance(value, (pandas.Series, Series))
and len(value) != len(self.index)
):
raise ValueError("Length of values does not match length of index")
@@ -948,6 +966,8 @@ def insert(self, loc, column, value, allow_duplicates=False):
)
if loc < 0:
raise ValueError("unbounded slice")
+ if isinstance(value, Series):
+ value = value._query_compiler
new_query_compiler = self._query_compiler.insert(loc, column, value)
self._update_inplace(new_query_compiler=new_query_compiler)
@@ -1068,30 +1088,6 @@ def lt(self, other, axis="columns", level=None):
"lt", other, axis=axis, level=level, broadcast=isinstance(other, Series)
)
- def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.median(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.median(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
def melt(
self,
id_vars=None,
@@ -1345,6 +1341,17 @@ def prod(
**kwargs,
):
axis = self._get_axis_number(axis)
+ if level is not None:
+ return self._default_to_pandas(
+ "prod",
+ axis=axis,
+ skipna=skipna,
+ level=level,
+ numeric_only=numeric_only,
+ min_count=min_count,
+ **kwargs,
+ )
+
axis_to_apply = self.columns if axis else self.index
if (
skipna is not False
@@ -1357,17 +1364,6 @@ def prod(
)
data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
- if level is not None:
- return data.__constructor__(
- query_compiler=data._query_compiler.prod_min_count(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- min_count=min_count,
- **kwargs,
- )
- )
if min_count > 1:
return data._reduce_dimension(
data._query_compiler.prod_min_count(
@@ -1400,6 +1396,35 @@ def query(self, expr, inplace=False, **kwargs):
new_query_compiler = self._query_compiler.query(expr, **kwargs)
return self._create_or_update_from_compiler(new_query_compiler, inplace)
+ def reindex(
+ self,
+ labels=None,
+ index=None,
+ columns=None,
+ axis=None,
+ method=None,
+ copy=True,
+ level=None,
+ fill_value=np.nan,
+ limit=None,
+ tolerance=None,
+ ):
+ axis = self._get_axis_number(axis)
+ if axis == 0 and labels is not None:
+ index = labels
+ elif labels is not None:
+ columns = labels
+ return super(DataFrame, self).reindex(
+ index=index,
+ columns=columns,
+ method=method,
+ copy=copy,
+ level=level,
+ fill_value=fill_value,
+ limit=limit,
+ tolerance=tolerance,
+ )
+
def rename(
self,
mapper=None,
@@ -1558,121 +1583,59 @@ def is_dtype_instance_mapper(column, dtype):
]
return self.drop(columns=self.columns[indicate], inplace=False)
- def sem(
- self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
- ):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.sem(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.sem(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
def set_index(
self, keys, drop=True, append=False, inplace=False, verify_integrity=False
):
inplace = validate_bool_kwarg(inplace, "inplace")
if not isinstance(keys, list):
keys = [keys]
- if inplace:
- frame = self
- else:
- frame = self.copy()
-
- arrays = []
- names = []
- if append:
- names = [x for x in self.index.names]
- if self._query_compiler.has_multiindex():
- for i in range(self.index.nlevels):
- arrays.append(self.index._get_level_values(i))
- else:
- arrays.append(self.index)
- to_remove = []
- for col in keys:
- if isinstance(col, pandas.MultiIndex):
- # append all but the last column so we don't have to modify
- # the end of this loop
- for n in range(col.nlevels - 1):
- arrays.append(col._get_level_values(n))
-
- level = col._get_level_values(col.nlevels - 1)
- names.extend(col.names)
- elif isinstance(col, pandas.Series):
- level = col._values
- names.append(col.name)
- elif isinstance(col, pandas.Index):
- level = col
- names.append(col.name)
- elif isinstance(col, (list, np.ndarray, pandas.Index)):
- level = col
- names.append(None)
- else:
- level = frame[col]._to_pandas()._values
- names.append(col)
- if drop:
- to_remove.append(col)
- arrays.append(level)
- index = ensure_index_from_sequences(arrays, names)
-
- if verify_integrity and not index.is_unique:
- duplicates = index.get_duplicates()
- raise ValueError("Index has duplicate keys: %s" % duplicates)
-
- for c in to_remove:
- del frame[c]
- # clear up memory usage
- index._cleanup()
- frame.index = index
-
- if not inplace:
- return frame
- def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.skew(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs,
+ if any(
+ isinstance(col, (pandas.Index, Series, np.ndarray, list, Iterator))
+ for col in keys
+ ):
+ # The current implementation cannot mix a list of column labels with
+ # list-like objects, so default to pandas in that case.
+ if not all(
+ isinstance(col, (pandas.Index, Series, np.ndarray, list, Iterator))
+ for col in keys
+ ):
+ return self._default_to_pandas(
+ "set_index",
+ keys,
+ drop=drop,
+ append=append,
+ inplace=inplace,
+ verify_integrity=verify_integrity,
)
+ if inplace:
+ frame = self
+ else:
+ frame = self.copy()
+ # These are single-threaded objects, so we might as well let pandas do the
+ # calculation so that it matches.
+ frame.index = (
+ pandas.DataFrame(index=self.index)
+ .set_index(keys, append=append, verify_integrity=verify_integrity)
+ .index
)
- return self._reduce_dimension(
- self._query_compiler.skew(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs,
- )
+ if not inplace:
+ return frame
+ else:
+ return
+ new_query_compiler = self._query_compiler.set_index_from_columns(
+ keys, drop=drop, append=append
)
- @property
- def sparse(self):
- return self._default_to_pandas(pandas.DataFrame.sparse)
+ if verify_integrity and not new_query_compiler.index.is_unique:
+ duplicates = new_query_compiler.index[
+ new_query_compiler.index.duplicated()
+ ].unique()
+ raise ValueError(f"Index has duplicate keys: {duplicates}")
+
+ return self._create_or_update_from_compiler(new_query_compiler, inplace=inplace)
+
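+ # Illustrative note (assumed examples, not part of the original change):
+ # `df.set_index(["a", "b"])` with plain column labels takes the query-compiler
+ # path above, a keys list made entirely of array-likes builds the new index
+ # through pandas, and mixing labels with array-likes falls back to pandas
+ # wholesale.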
+ sparse = CachedAccessor("sparse", SparseFrameAccessor)
def squeeze(self, axis=None):
axis = self._get_axis_number(axis) if axis is not None else None
@@ -1685,34 +1648,6 @@ def squeeze(self, axis=None):
else:
return self.copy()
- def std(
- self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
- ):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.std(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.std(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
def stack(self, level=-1, dropna=True):
if not isinstance(self.columns, pandas.MultiIndex) or (
isinstance(self.columns, pandas.MultiIndex)
@@ -1762,15 +1697,14 @@ def sum(
axis, numeric_only, ignore_axis=False
)
if level is not None:
- return data.__constructor__(
- query_compiler=data._query_compiler.sum_min_count(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- min_count=min_count,
- **kwargs,
- )
+ return self._default_to_pandas(
+ "sum",
+ axis=axis,
+ skipna=skipna,
+ level=level,
+ numeric_only=numeric_only,
+ min_count=min_count,
+ **kwargs,
)
if min_count > 1:
return data._reduce_dimension(
@@ -1879,11 +1813,12 @@ def to_html(
def to_parquet(
self,
- path,
+ path=None,
engine="auto",
compression="snappy",
index=None,
partition_cols=None,
+ storage_options: StorageOptions = None,
**kwargs,
): # pragma: no cover
return self._default_to_pandas(
@@ -1893,6 +1828,7 @@ def to_parquet(
compression=compression,
index=index,
partition_cols=partition_cols,
+ storage_options=storage_options,
**kwargs,
)
@@ -1919,6 +1855,7 @@ def to_stata(
version=114,
convert_strl=None,
compression: Union[str, Mapping[str, str], None] = "infer",
+ storage_options: StorageOptions = None,
): # pragma: no cover
return self._default_to_pandas(
pandas.DataFrame.to_stata,
@@ -1932,6 +1869,7 @@ def to_stata(
version=version,
convert_strl=convert_strl,
compression=compression,
+ storage_options=storage_options,
)
def to_timestamp(self, freq=None, how="start", axis=0, copy=True):
@@ -1965,34 +1903,6 @@ def update(
)
self._update_inplace(new_query_compiler=query_compiler)
- def var(
- self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
- ):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.var(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.var(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
def value_counts(
self,
subset: Optional[Sequence[Label]] = None,
@@ -2098,16 +2008,12 @@ def __setattr__(self, key, value):
object.__setattr__(self, key, value)
def __setitem__(self, key, value):
+ if isinstance(key, slice):
+ return self._setitem_slice(key, value)
+
if hashable(key) and key not in self.columns:
- # Handle new column case first
- if isinstance(value, Series):
- if len(self.columns) == 0:
- self._query_compiler = value._query_compiler.copy()
- else:
- self._create_or_update_from_compiler(
- self._query_compiler.concat(1, value._query_compiler),
- inplace=True,
- )
+ if isinstance(value, Series) and len(self.columns) == 0:
+ self._query_compiler = value._query_compiler.copy()
# Now that the data is appended, we need to update the column name for
# that column to `key`, otherwise the name could be incorrect. Drop the
# last column name from the list (the appended value's name and append
@@ -2135,8 +2041,7 @@ def __setitem__(self, key, value):
self.insert(loc=len(self.columns), column=key, value=value)
return
- if not isinstance(key, str):
-
+ if not hashable(key):
if isinstance(key, DataFrame) or isinstance(key, np.ndarray):
if isinstance(key, np.ndarray):
if key.shape != self.shape:
@@ -2144,7 +2049,7 @@ def __setitem__(self, key, value):
key = DataFrame(key, columns=self.columns)
return self.mask(key, value, inplace=True)
- def setitem_without_string_columns(df):
+ def setitem_unhashable_key(df):
# Arrow makes memory-mapped objects immutable, so copy will allow them
# to be mutable again.
df = df.copy(True)
@@ -2152,7 +2057,7 @@ def setitem_without_string_columns(df):
return df
return self._update_inplace(
- self._default_to_pandas(setitem_without_string_columns)._query_compiler
+ self._default_to_pandas(setitem_unhashable_key)._query_compiler
)
if is_list_like(value):
if isinstance(value, (pandas.DataFrame, DataFrame)):
@@ -2216,7 +2121,6 @@ def __delitem__(self, key):
__mod__ = mod
__imod__ = mod # pragma: no cover
__rmod__ = rmod
- __div__ = div
__rdiv__ = rdiv
@property
@@ -2266,6 +2170,30 @@ def _create_or_update_from_compiler(self, new_query_compiler, inplace=False):
else:
self._update_inplace(new_query_compiler=new_query_compiler)
+ def _get_numeric_data(self, axis: int):
+ """
+ Grabs only numeric columns from frame.
+
+ Parameters
+ ----------
+ axis: int
+ Axis to inspect on having numeric types only.
+ If axis is not 0, returns the frame itself.
+
+ Returns
+ -------
+ DataFrame with numeric data.
+ """
+ # Pandas ignores `numeric_only` if `axis` is 1, but we do have to drop
+ # non-numeric columns if `axis` is 0.
+ if axis != 0:
+ return self
+ return self.drop(
+ columns=[
+ i for i in self.dtypes.index if not is_numeric_dtype(self.dtypes[i])
+ ]
+ )
+
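+ # Illustrative note (assumed data, not part of the original change): for a
+ # frame with columns {"a": int64, "b": object}, `_get_numeric_data(axis=0)`
+ # keeps only column "a", while `axis=1` returns the frame unchanged because
+ # pandas ignores `numeric_only` filtering along that axis.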
def _validate_dtypes(self, numeric_only=False):
"""
Help to check that all the dtypes are the same.
@@ -2305,16 +2233,12 @@ def _validate_dtypes_min_max(self, axis, numeric_only):
for dtype in self.dtypes
):
raise TypeError("Cannot compare Numeric and Non-Numeric Types")
- # Pandas ignores `numeric_only` if `axis` is 1, but we do have to drop
- # non-numeric columns if `axis` is 0.
- if numeric_only and axis == 0:
- return self.drop(
- columns=[
- i for i in self.dtypes.index if not is_numeric_dtype(self.dtypes[i])
- ]
- )
- else:
- return self
+
+ return (
+ self._get_numeric_data(axis)
+ if numeric_only is None or numeric_only
+ else self
+ )
def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False):
"""
@@ -2363,16 +2287,12 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False):
for dtype in self.dtypes
):
raise TypeError("Cannot operate on Numeric and Non-Numeric Types")
- # Pandas ignores `numeric_only` if `axis` is 1, but we do have to drop
- # non-numeric columns if `axis` is 0.
- if numeric_only and axis == 0:
- return self.drop(
- columns=[
- i for i in self.dtypes.index if not is_numeric_dtype(self.dtypes[i])
- ]
- )
- else:
- return self
+
+ return (
+ self._get_numeric_data(axis)
+ if numeric_only is None or numeric_only
+ else self
+ )
def _to_pandas(self):
return self._query_compiler.to_pandas()
diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index 581579cafe6..1aa39f9e731 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -159,27 +159,146 @@ def merge_asof(
"can not merge DataFrame with instance of type {}".format(type(right))
)
ErrorMessage.default_to_pandas("`merge_asof`")
- if isinstance(right, DataFrame):
- right = to_pandas(right)
- return DataFrame(
- pandas.merge_asof(
- to_pandas(left),
- right,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- by=by,
- left_by=left_by,
- right_by=right_by,
- suffixes=suffixes,
- tolerance=tolerance,
- allow_exact_matches=allow_exact_matches,
- direction=direction,
+
+ # As of Pandas 1.2 these combinations raise an error; before that the
+ # behavior was effectively undefined:
+ if (
+ (on and (left_index or right_index))
+ or (left_on and left_index)
+ or (right_on and right_index)
+ ):
+ raise ValueError("Can't combine left/right_index with left/right_on or on.")
+
+ # Pandas fallbacks for tricky cases:
+ if (
+ # No idea how this works or why it does what it does; and in fact
+ # there's a Pandas bug suggesting it's wrong:
+ # https://github.com/pandas-dev/pandas/issues/33463
+ (left_index and right_on is not None)
+ # This is the case where by is a list of columns. If we're copying lots
+ # of columns out of Pandas, maybe not worth trying our path, it's not
+ # clear it's any better:
+ or not isinstance(by, (str, type(None)))
+ or not isinstance(left_by, (str, type(None)))
+ or not isinstance(right_by, (str, type(None)))
+ ):
+ if isinstance(right, DataFrame):
+ right = to_pandas(right)
+ return DataFrame(
+ pandas.merge_asof(
+ to_pandas(left),
+ right,
+ on=on,
+ left_on=left_on,
+ right_on=right_on,
+ left_index=left_index,
+ right_index=right_index,
+ by=by,
+ left_by=left_by,
+ right_by=right_by,
+ suffixes=suffixes,
+ tolerance=tolerance,
+ allow_exact_matches=allow_exact_matches,
+ direction=direction,
+ )
)
+
+ left_column = None
+ right_column = None
+
+ if on is not None:
+ if left_on is not None or right_on is not None:
+ raise ValueError("If 'on' is set, 'left_on' and 'right_on' can't be set.")
+ left_on = on
+ right_on = on
+
+ if left_on is not None:
+ left_column = to_pandas(left[left_on])
+ elif left_index:
+ left_column = left.index
+ else:
+ raise ValueError("Need some sort of 'on' spec")
+
+ if right_on is not None:
+ right_column = to_pandas(right[right_on])
+ elif right_index:
+ right_column = right.index
+ else:
+ raise ValueError("Need some sort of 'on' spec")
+
+ # If we haven't set these by now, there's a bug in this function.
+ assert left_column is not None
+ assert right_column is not None
+
+ if by is not None:
+ if left_by is not None or right_by is not None:
+ raise ValueError("Can't have both 'by' and 'left_by' or 'right_by'")
+ left_by = right_by = by
+
+ # List of columns case should have been handled by direct Pandas fallback
+ # earlier:
+ assert isinstance(left_by, (str, type(None)))
+ assert isinstance(right_by, (str, type(None)))
+
+ left_pandas_limited = {"on": left_column}
+ right_pandas_limited = {"on": right_column, "right_labels": right.index}
+ extra_kwargs = {} # extra arguments to Pandas merge_asof
+
+ if left_by is not None or right_by is not None:
+ extra_kwargs["by"] = "by"
+ left_pandas_limited["by"] = to_pandas(left[left_by])
+ right_pandas_limited["by"] = to_pandas(right[right_by])
+
+ # 1. Construct Pandas DataFrames with just the 'on' and optional 'by'
+ # columns, and the index as another column.
+ left_pandas_limited = pandas.DataFrame(left_pandas_limited, index=left.index)
+ right_pandas_limited = pandas.DataFrame(right_pandas_limited)
+
+ # 2. Use Pandas' merge_asof to figure out how to map labels on left to
+ # labels on the right.
+ merged = pandas.merge_asof(
+ left_pandas_limited,
+ right_pandas_limited,
+ on="on",
+ direction=direction,
+ allow_exact_matches=allow_exact_matches,
+ tolerance=tolerance,
+ **extra_kwargs,
+ )
+ # Now merged["right_labels"] shows which labels from right map to left's index.
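+ # Illustrative sketch (toy data, not from the original change): with left
+ # "on" values [1, 5, 10] indexed ["a", "b", "c"] and right "on" values [2, 6]
+ # indexed ["x", "y"], a backward search yields
+ # merged["right_labels"] == [NaN, "x", "y"]: each left row gets the label of
+ # the closest earlier right row, or NaN when none exists.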
+
+ # 3. Re-index right using the merged["right_labels"]; at this point right
+ # should be same length and (semantically) same order as left:
+ right_subset = right.reindex(index=pandas.Index(merged["right_labels"]))
+ if not right_index:
+ right_subset.drop(columns=[right_on], inplace=True)
+ if right_by is not None and left_by == right_by:
+ right_subset.drop(columns=[right_by], inplace=True)
+ right_subset.index = left.index
+
+ # 4. Merge left and the new shrunken right:
+ result = merge(
+ left,
+ right_subset,
+ left_index=True,
+ right_index=True,
+ suffixes=suffixes,
+ how="left",
)
+ # 5. Clean up to match Pandas output:
+ if left_on is not None and right_index:
+ result.insert(
+ # In theory this could use get_indexer_for(), but that causes an error:
+ list(result.columns).index(left_on + suffixes[0]),
+ left_on,
+ result[left_on + suffixes[0]],
+ )
+ if not left_index and not right_index:
+ result.index = pandas.RangeIndex(start=0, stop=len(result))
+
+ return result
+
@_inherit_docstrings(pandas.pivot_table)
def pivot_table(
diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py
index ef68c6d94d9..e7b7c99d07b 100644
--- a/modin/pandas/groupby.py
+++ b/modin/pandas/groupby.py
@@ -22,13 +22,23 @@
Manually add documentation for methods which are not presented in pandas.
"""
+import numpy as np
import pandas
import pandas.core.groupby
from pandas.core.dtypes.common import is_list_like
+from pandas.core.aggregation import reconstruct_func
import pandas.core.common as com
+from types import BuiltinFunctionType
+from collections.abc import Iterable
from modin.error_message import ErrorMessage
-from modin.utils import _inherit_docstrings, wrap_udf_function, try_cast_to_pandas
+from modin.utils import (
+ _inherit_docstrings,
+ try_cast_to_pandas,
+ wrap_udf_function,
+ hashable,
+)
+from modin.backends.base.query_compiler import BaseQueryCompiler
from modin.config import IsExperimental
from .series import Series
@@ -74,8 +84,9 @@ def __init__(
not isinstance(by, type(self._query_compiler))
and axis == 0
and all(
- (isinstance(obj, str) and obj in self._query_compiler.columns)
- or isinstance(obj, Series)
+ (hashable(obj) and obj in self._query_compiler.columns)
+ or isinstance(obj, type(self._query_compiler))
+ or is_list_like(obj)
for obj in self._by
)
)
@@ -87,8 +98,8 @@ def __init__(
"sort": sort,
"as_index": as_index,
"group_keys": group_keys,
- "squeeze": squeeze,
}
+ self._squeeze = squeeze
self._kwargs.update(kwargs)
_index_grouped_cache = None
@@ -110,7 +121,7 @@ def __getattr__(self, key):
return object.__getattribute__(self, key)
except AttributeError as e:
if key in self._columns:
- return self._default_to_pandas(lambda df: df.__getitem__(key))
+ return self.__getitem__(key)
raise e
@property
@@ -170,10 +181,57 @@ def idxmax(self):
def ndim(self):
return 2 # ndim is always 2 for DataFrames
- def shift(self, periods=1, freq=None, axis=0):
- return self._default_to_pandas(
- lambda df: df.shift(periods=periods, freq=freq, axis=axis)
- )
+ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ def _shift(periods, freq, axis, fill_value, is_set_nan_rows=True):
+ from .dataframe import DataFrame
+
+ result = self._df.shift(periods, freq, axis, fill_value)
+
+ if (
+ is_set_nan_rows
+ and isinstance(self._by, BaseQueryCompiler)
+ and (
+ # Checking with `issubset` is effective only in the MultiIndex case
+ set(self._by.columns).issubset(list(self._df.columns))
+ if isinstance(self._by.columns, pandas.MultiIndex)
+ else len(
+ self._by.columns.unique()
+ .sort_values()
+ .difference(self._df.columns.unique().sort_values())
+ )
+ == 0
+ )
+ and DataFrame(query_compiler=self._by.isna()).any(axis=None)
+ ):
+ mask_nan_rows = self._df[self._by.columns].isna().any(axis=1)
+ # drop NaN groups
+ result = result.loc[~mask_nan_rows]
+ return result
+
+ if freq is None and axis == 1 and self._axis == 0:
+ result = _shift(periods, freq, axis, fill_value)
+ elif (
+ freq is not None
+ and axis == 0
+ and self._axis == 0
+ and isinstance(self._by, BaseQueryCompiler)
+ ):
+ result = _shift(periods, freq, axis, fill_value, is_set_nan_rows=False)
+ new_idx_lvl_arrays = np.concatenate(
+ [self._df[self._by.columns].values.T, [list(result.index)]]
+ )
+ result.index = pandas.MultiIndex.from_arrays(
+ new_idx_lvl_arrays,
+ names=[col_name for col_name in self._by.columns]
+ + [result._query_compiler.get_index_name()],
+ )
+ result = result.dropna(subset=self._by.columns).sort_index()
+ else:
+ result = self._apply_agg_function(
+ lambda df: df.shift(periods, freq, axis, fill_value)
+ )
+ result._query_compiler.set_index_name(None)
+ return result
def nth(self, n, dropna=None):
return self._default_to_pandas(lambda df: df.nth(n, dropna=dropna))
@@ -181,7 +239,7 @@ def nth(self, n, dropna=None):
def cumsum(self, axis=0, *args, **kwargs):
result = self._apply_agg_function(lambda df: df.cumsum(axis, *args, **kwargs))
# pandas does not name the index on cumsum
- result.index.name = None
+ result._query_compiler.set_index_name(None)
return result
@property
@@ -199,21 +257,22 @@ def filter(self, func, dropna=True, *args, **kwargs):
def cummax(self, axis=0, **kwargs):
result = self._apply_agg_function(lambda df: df.cummax(axis, **kwargs))
# pandas does not name the index on cummax
- result.index.name = None
+ result._query_compiler.set_index_name(None)
return result
def apply(self, func, *args, **kwargs):
- return self._apply_agg_function(
- # Grouping column in never dropped in groupby.apply, so drop=False
- lambda df: df.apply(func, *args, **kwargs),
- drop=False,
- )
+ if not isinstance(func, BuiltinFunctionType):
+ func = wrap_udf_function(func)
+ return self._apply_agg_function(lambda df: df.apply(func, *args, **kwargs))
@property
def dtypes(self):
if self._axis == 1:
raise ValueError("Cannot call dtypes on groupby with axis=1")
- return self._apply_agg_function(lambda df: df.dtypes, drop=self._as_index)
+ if not self._as_index:
+ return self.apply(lambda df: df.dtypes)
+ else:
+ return self._apply_agg_function(lambda df: df.dtypes)
def first(self, **kwargs):
return self._default_to_pandas(lambda df: df.first(**kwargs))
@@ -222,7 +281,7 @@ def backfill(self, limit=None):
return self.bfill(limit)
def __getitem__(self, key):
- kwargs = self._kwargs.copy()
+ kwargs = {**self._kwargs.copy(), "squeeze": self._squeeze}
# Most of time indexing DataFrameGroupBy results in another DataFrameGroupBy object unless circumstances are
# special in which case SeriesGroupBy has to be returned. Such circumstances are when key equals to a single
# column name and is not a list of column names or list of one column name.
@@ -258,7 +317,7 @@ def __getitem__(self, key):
if (
self._is_multi_by
and isinstance(self._by, list)
- and not all(isinstance(o, str) for o in self._by)
+ and not all(hashable(o) and o in self._df for o in self._by)
):
raise NotImplementedError(
"Column lookups on GroupBy with arbitrary Series in by"
@@ -276,7 +335,7 @@ def __getitem__(self, key):
def cummin(self, axis=0, **kwargs):
result = self._apply_agg_function(lambda df: df.cummin(axis=axis, **kwargs))
# pandas does not name the index on cummin
- result.index.name = None
+ result._query_compiler.set_index_name(None)
return result
def bfill(self, limit=None):
@@ -300,55 +359,78 @@ def aggregate(self, func=None, *args, **kwargs):
# This is not implemented in pandas,
# so we throw a different message
raise NotImplementedError("axis other than 0 is not supported")
+
+ if (
+ callable(func)
+ and isinstance(func, BuiltinFunctionType)
+ and func.__name__ in dir(self)
+ ):
+ func = func.__name__
+
+ relabeling_required = False
if isinstance(func, dict) or func is None:
- if func is None:
- func = {}
- else:
- if any(i not in self._df.columns for i in func.keys()):
- from pandas.core.base import SpecificationError
- raise SpecificationError("nested renamer is not supported")
- if isinstance(self._by, type(self._query_compiler)):
- by = list(self._by.columns)
- else:
- by = self._by
- # We convert to the string version of the function for simplicity.
- func_dict = {
- k: v if not callable(v) or v.__name__ not in dir(self) else v.__name__
- for k, v in func.items()
- }
- subset_cols = list(func_dict.keys()) + (
- list(self._by.columns)
- if isinstance(self._by, type(self._query_compiler))
- and all(c in self._df.columns for c in self._by.columns)
- else []
- )
- return type(self._df)(
- query_compiler=self._df[subset_cols]._query_compiler.groupby_dict_agg(
- by=by,
- func_dict=func_dict,
- groupby_args=self._kwargs,
- agg_args=kwargs,
- drop=self._drop,
- )
+ def try_get_str_func(fn):
+ if not isinstance(fn, str) and isinstance(fn, Iterable):
+ return [try_get_str_func(f) for f in fn]
+ return fn.__name__ if callable(fn) and fn.__name__ in dir(self) else fn
+
+ relabeling_required, func_dict, new_columns, order = reconstruct_func(
+ func, **kwargs
)
- if is_list_like(func):
+ func_dict = {col: try_get_str_func(fn) for col, fn in func_dict.items()}
+
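+ # Illustrative note (assumed example, not part of the original change):
+ # named aggregation such as `grp.agg(total=("b", "sum"))` reaches this branch
+ # with `func=None`; `reconstruct_func` then reports relabeling_required=True,
+ # func_dict={"b": ["sum"]} and the "total" label in `new_columns`, which is
+ # re-applied to the result further below.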
+ if any(i not in self._df.columns for i in func_dict.keys()):
+ from pandas.core.base import SpecificationError
+
+ raise SpecificationError("nested renamer is not supported")
+ if func is None:
+ kwargs = {}
+ func = func_dict
+ elif is_list_like(func):
return self._default_to_pandas(
lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
*args,
**kwargs,
)
- if isinstance(func, str):
- agg_func = getattr(self, func, None)
+ elif callable(func):
+ return self._apply_agg_function(
+ lambda grp, *args, **kwargs: grp.aggregate(func, *args, **kwargs),
+ *args,
+ **kwargs,
+ )
+ elif isinstance(func, str):
+ # Using "getattr" here would mask the AttributeError that we raise
+ # in __getattr__, so call __getattr__ directly instead.
+ agg_func = self.__getattr__(func)
if callable(agg_func):
return agg_func(*args, **kwargs)
- return self._apply_agg_function(
- lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
- drop=self._as_index,
+
+ result = self._apply_agg_function(
+ func,
*args,
**kwargs,
)
+ if relabeling_required:
+ if not self._as_index:
+ nby_cols = len(result.columns) - len(new_columns)
+ order = np.concatenate([np.arange(nby_cols), order + nby_cols])
+ by_cols = result.columns[:nby_cols]
+ new_columns = pandas.Index(new_columns)
+ if by_cols.nlevels != new_columns.nlevels:
+ by_cols = by_cols.remove_unused_levels()
+ empty_levels = [
+ i
+ for i, level in enumerate(by_cols.levels)
+ if len(level) == 1 and level[0] == ""
+ ]
+ by_cols = by_cols.droplevel(empty_levels)
+ new_columns = by_cols.append(new_columns)
+ result = result.iloc[:, order]
+ result.columns = new_columns
+ return result
+
agg = aggregate
def last(self, **kwargs):
@@ -360,7 +442,7 @@ def mad(self, **kwargs):
def rank(self, **kwargs):
result = self._apply_agg_function(lambda df: df.rank(**kwargs))
# pandas does not name the index on rank
- result.index.name = None
+ result._query_compiler.set_index_name(None)
return result
@property
@@ -400,13 +482,16 @@ def size(self):
# Series objects in 'by' mean we couldn't handle the case
# and transform 'by' to a query compiler.
# In this case we are just defaulting to pandas.
- if is_list_like(self._by) and any(isinstance(o, Series) for o in self._by):
+ if is_list_like(self._by) and any(
+ isinstance(o, type(self._df._query_compiler)) for o in self._by
+ ):
work_object = DataFrameGroupBy(
self._df,
self._by,
self._axis,
drop=False,
idx_name=None,
+ squeeze=self._squeeze,
**self._kwargs,
)
result = work_object._wrap_aggregation(
@@ -429,6 +514,7 @@ def size(self):
self._axis,
drop=False,
idx_name=None,
+ squeeze=self._squeeze,
**self._kwargs,
)
result = work_object._wrap_aggregation(
@@ -455,6 +541,7 @@ def size(self):
0,
drop=self._drop,
idx_name=self._idx_name,
+ squeeze=self._squeeze,
**self._kwargs,
).size()
@@ -514,7 +601,7 @@ def head(self, n=5):
def cumprod(self, axis=0, *args, **kwargs):
result = self._apply_agg_function(lambda df: df.cumprod(axis, *args, **kwargs))
# pandas does not name the index on cumprod
- result.index.name = None
+ result._query_compiler.set_index_name(None)
return result
def __iter__(self):
@@ -528,16 +615,27 @@ def transform(self, func, *args, **kwargs):
lambda df: df.transform(func, *args, **kwargs)
)
# pandas does not name the index on transform
- result.index.name = None
+ result._query_compiler.set_index_name(None)
return result
def corr(self, **kwargs):
return self._default_to_pandas(lambda df: df.corr(**kwargs))
def fillna(self, **kwargs):
- result = self._apply_agg_function(lambda df: df.fillna(**kwargs))
+ new_groupby_kwargs = self._kwargs.copy()
+ new_groupby_kwargs["as_index"] = True
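+ # Hedged note (assumed rationale, not from the original change): forcing
+ # `as_index=True` here presumably keeps the grouping columns from being
+ # inserted into the result, since groupby.fillna in pandas returns a frame
+ # shaped like the original rather than an aggregation.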
+ work_object = type(self)(
+ df=self._df,
+ by=self._by,
+ axis=self._axis,
+ idx_name=self._idx_name,
+ drop=self._drop,
+ squeeze=self._squeeze,
+ **new_groupby_kwargs,
+ )
+ result = work_object._apply_agg_function(lambda df: df.fillna(**kwargs))
# pandas does not name the index on fillna
- result.index.name = None
+ result._query_compiler.set_index_name(None)
return result
def count(self, **kwargs):
@@ -558,7 +656,7 @@ def pipe(self, func, *args, **kwargs):
def cumcount(self, ascending=True):
result = self._default_to_pandas(lambda df: df.cumcount(ascending=ascending))
# pandas does not name the index on cumcount
- result.index.name = None
+ result._query_compiler.set_index_name(None)
return result
def tail(self, n=5):
@@ -729,12 +827,19 @@ def _index_grouped(self):
# aware.
ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
ErrorMessage.default_to_pandas("Groupby with multiple columns")
- if isinstance(by, list) and all(isinstance(o, str) for o in by):
+ if isinstance(by, list) and all(
+ hashable(o)
+ and (
+ o in self._df
+ or o in self._df._query_compiler.get_index_names(self._axis)
+ )
+ for o in by
+ ):
pandas_df = self._df._query_compiler.getitem_column_array(
by
).to_pandas()
else:
- by = try_cast_to_pandas(by)
+ by = try_cast_to_pandas(by, squeeze=True)
pandas_df = self._df._to_pandas()
self._index_grouped_cache = pandas_df.groupby(by=by).groups
else:
@@ -777,7 +882,7 @@ def _wrap_aggregation(
# For aggregations, pandas behavior does this for the result.
# For other operations it does not, so we wait until there is an aggregation to
# actually perform this operation.
- if drop and self._drop and self._as_index:
+ if not self._is_multi_by and drop and self._drop and self._as_index:
groupby_qc = self._query_compiler.drop(columns=self._by.columns)
else:
groupby_qc = self._query_compiler
@@ -794,11 +899,11 @@ def _wrap_aggregation(
drop=self._drop,
)
)
- if self._kwargs.get("squeeze", False):
+ if self._squeeze:
return result.squeeze()
return result
- def _apply_agg_function(self, f, drop=True, *args, **kwargs):
+ def _apply_agg_function(self, f, *args, **kwargs):
"""
Perform aggregation and combine stages based on a given function.
@@ -806,45 +911,33 @@ def _apply_agg_function(self, f, drop=True, *args, **kwargs):
Parameters
----------
- f:
+ f: callable
The function to apply to each group.
Returns
-------
A new combined DataFrame with the result of all groups.
"""
- assert callable(f), "'{0}' object is not callable".format(type(f))
-
- f = wrap_udf_function(f)
- if self._is_multi_by:
- return self._default_to_pandas(f, *args, **kwargs)
-
- if isinstance(self._by, type(self._query_compiler)):
- by = self._by.to_pandas().squeeze()
- else:
- by = self._by
+ assert callable(f) or isinstance(
+ f, dict
+ ), "'{0}' object is not callable and not a dict".format(type(f))
- # For aggregations, pandas behavior does this for the result.
- # For other operations it does not, so we wait until there is an aggregation to
- # actually perform this operation.
- if self._idx_name is not None and drop and self._drop:
- groupby_qc = self._query_compiler.drop(columns=[self._idx_name])
- else:
- groupby_qc = self._query_compiler
- new_manager = groupby_qc.groupby_agg(
- by=by,
+ new_manager = self._query_compiler.groupby_agg(
+ by=self._by,
+ is_multi_by=self._is_multi_by,
axis=self._axis,
agg_func=f,
- groupby_args=self._kwargs,
- agg_args=kwargs,
+ agg_args=args,
+ agg_kwargs=kwargs,
+ groupby_kwargs=self._kwargs,
drop=self._drop,
)
if self._idx_name is not None and self._as_index:
- new_manager.index.name = self._idx_name
+ new_manager.set_index_name(self._idx_name)
result = type(self._df)(query_compiler=new_manager)
- if result.index.name == "__reduced__":
- result.index.name = None
- if self._kwargs.get("squeeze", False):
+ if result._query_compiler.get_index_name() == "__reduced__":
+ result._query_compiler.set_index_name(None)
+ if self._squeeze:
return result.squeeze()
return result
@@ -873,11 +966,15 @@ def _default_to_pandas(self, f, *args, **kwargs):
else:
by = self._by
- by = try_cast_to_pandas(by)
+ by = try_cast_to_pandas(by, squeeze=True)
def groupby_on_multiple_columns(df, *args, **kwargs):
return f(
- df.groupby(by=by, axis=self._axis, **self._kwargs), *args, **kwargs
+ df.groupby(
+ by=by, axis=self._axis, squeeze=self._squeeze, **self._kwargs
+ ),
+ *args,
+ **kwargs,
)
return self._df._default_to_pandas(groupby_on_multiple_columns, *args, **kwargs)
diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
index c0922d4c378..d5ca2cb297b 100644
--- a/modin/pandas/indexing.py
+++ b/modin/pandas/indexing.py
@@ -60,6 +60,26 @@ def is_slice(x):
return isinstance(x, slice)
+def compute_sliced_len(slc, sequence_len):
+ """
+ Compute the length of the result of applying a slice.
+
+ Parameters
+ ----------
+ slc: slice
+ Slice to apply.
+ sequence_len: int
+ Length of the sequence the slice will be applied to.
+
+ Returns
+ -------
+ int
+ Length of the sequence after the slice is applied.
+ """
+ # Translate the slice into a range; the range's length is the answer.
+ return len(range(*slc.indices(sequence_len)))
+
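+ # Example (illustrative only, not part of the original change):
+ # compute_sliced_len(slice(1, None, 2), 10) == 5, since
+ # range(*slice(1, None, 2).indices(10)) == range(1, 10, 2).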
+
def is_2d(x):
"""
Implement [METHOD_NAME].
@@ -293,7 +313,7 @@ def __getitem__(self, row_lookup, col_lookup, ndim):
)
return self.df.__constructor__(query_compiler=qc_view).squeeze(axis=axis)
- def __setitem__(self, row_lookup, col_lookup, item):
+ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
"""
Implement [METHOD_NAME].
@@ -317,15 +337,11 @@ def __setitem__(self, row_lookup, col_lookup, item):
col_lookup = range(len(self.qc.columns))[col_lookup]
# This is True when we dealing with assignment of a full column. This case
# should be handled in a fastpath with `df[col] = item`.
- if (
- len(row_lookup) == len(self.qc.index)
- and len(col_lookup) == 1
- and hasattr(self.df, "columns")
- ):
+ if axis == 0:
self.df[self.df.columns[col_lookup][0]] = item
# This is True when we are assigning to a full row. We want to reuse the setitem
# mechanism to operate along only one axis for performance reasons.
- elif len(col_lookup) == len(self.qc.columns) and len(row_lookup) == 1:
+ elif axis == 1:
if hasattr(item, "_query_compiler"):
item = item._query_compiler
new_qc = self.qc.setitem(1, self.qc.index[row_lookup[0]], item)
@@ -368,21 +384,25 @@ def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
"""
# It is valid to pass a DataFrame or Series to __setitem__ that is larger than
# the target the user is trying to overwrite. This
- if isinstance(item, (pandas.Series, pandas.DataFrame, DataFrame)):
- if not all(idx in item.index for idx in row_lookup):
+ if isinstance(item, (pandas.Series, pandas.DataFrame, Series, DataFrame)):
+ # convert positional indices in the lookups to labels, since pandas reindex expects labels
+ index_values = self.qc.index[row_lookup]
+ if not all(idx in item.index for idx in index_values):
raise ValueError(
"Must have equal len keys and value when setting with "
"an iterable"
)
if hasattr(item, "columns"):
- if not all(idx in item.columns for idx in col_lookup):
+ column_values = self.qc.columns[col_lookup]
+ if not all(col in item.columns for col in column_values):
+ # TODO: consider whether columns with duplicate names need special handling
raise ValueError(
"Must have equal len keys and value when setting "
"with an iterable"
)
- item = item.reindex(index=row_lookup, columns=col_lookup)
+ item = item.reindex(index=index_values, columns=column_values)
else:
- item = item.reindex(index=row_lookup)
+ item = item.reindex(index=index_values)
try:
item = np.array(item)
if np.prod(to_shape) == np.prod(item.shape):
@@ -417,6 +437,57 @@ def _write_items(self, row_lookup, col_lookup, item):
new_qc = self.qc.write_items(row_lookup, col_lookup, item)
self.df._create_or_update_from_compiler(new_qc, inplace=True)
+ def _determine_setitem_axis(self, row_lookup, col_lookup, row_scaler, col_scaler):
+ """
+ Determine the axis along which the assignment should be done.
+
+ Parameters
+ ----------
+ row_lookup: slice or list
+ Indexer for rows.
+ col_lookup: slice or list
+ Indexer for columns.
+ row_scaler: bool
+ Whether the indexer for rows was a scalar.
+ col_scaler: bool
+ Whether the indexer for columns was a scalar.
+
+ Returns
+ -------
+ int or None
+ None if the assignment covers both axes, otherwise the number of the axis to assign along.
+
+ Notes
+ -----
+ axis = 0: column assignment df[col] = item
+ axis = 1: row assignment df.loc[row] = item
+ axis = None: assignment along both axes
+ """
+ if self.df.shape == (1, 1):
+ return None if not (row_scaler ^ col_scaler) else 1 if row_scaler else 0
+
+ def get_axis(axis):
+ return self.qc.index if axis == 0 else self.qc.columns
+
+ row_lookup_len, col_lookup_len = [
+ len(lookup)
+ if not isinstance(lookup, slice)
+ else compute_sliced_len(lookup, len(get_axis(i)))
+ for i, lookup in enumerate([row_lookup, col_lookup])
+ ]
+
+ if (
+ row_lookup_len == len(self.qc.index)
+ and col_lookup_len == 1
+ and isinstance(self.df, DataFrame)
+ ):
+ axis = 0
+ elif col_lookup_len == len(self.qc.columns) and row_lookup_len == 1:
+ axis = 1
+ else:
+ axis = None
+ return axis
+
class _LocIndexer(_LocationIndexerBase):
"""An indexer for modin_df.loc[] functionality."""
@@ -474,7 +545,7 @@ def __getitem__(self, key):
)
):
result.index = result.index.droplevel(list(range(len(col_loc))))
- elif all(
+ elif not isinstance(row_loc, slice) and all(
not isinstance(row_loc[i], slice)
and row_loc[i] in result.index.levels[i]
for i in range(len(row_loc))
@@ -487,6 +558,15 @@ def __getitem__(self, key):
and all(col_loc[i] in result.columns.levels[i] for i in range(len(col_loc)))
):
result.columns = result.columns.droplevel(list(range(len(col_loc))))
+ # This is done for cases where the index passed in has other state, like a
+ # frequency in the case of DateTimeIndex.
+ if (
+ row_lookup is not None
+ and isinstance(col_loc, slice)
+ and col_loc == slice(None)
+ and isinstance(key, pandas.Index)
+ ):
+ result.index = key
return result
def __setitem__(self, key, item):
@@ -507,7 +587,7 @@ def __setitem__(self, key, item):
-------
What this returns (if anything)
"""
- row_loc, col_loc, _, __, ___ = _parse_tuple(key)
+ row_loc, col_loc, _, row_scaler, col_scaler = _parse_tuple(key)
if isinstance(row_loc, list) and len(row_loc) == 1:
if row_loc[0] not in self.qc.index:
index = self.qc.index.insert(len(self.qc.index), row_loc[0])
@@ -525,7 +605,14 @@ def __setitem__(self, key, item):
self.qc = self.df._query_compiler
else:
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
- super(_LocIndexer, self).__setitem__(row_lookup, col_lookup, item)
+ super(_LocIndexer, self).__setitem__(
+ row_lookup,
+ col_lookup,
+ item,
+ axis=self._determine_setitem_axis(
+ row_lookup, col_lookup, row_scaler, col_scaler
+ ),
+ )
def _compute_enlarge_labels(self, locator, base_index):
"""
@@ -663,12 +750,19 @@ def __setitem__(self, key, item):
-------
What this returns (if anything)
"""
- row_loc, col_loc, _, __, ___ = _parse_tuple(key)
+ row_loc, col_loc, _, row_scaler, col_scaler = _parse_tuple(key)
self._check_dtypes(row_loc)
self._check_dtypes(col_loc)
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
- super(_iLocIndexer, self).__setitem__(row_lookup, col_lookup, item)
+ super(_iLocIndexer, self).__setitem__(
+ row_lookup,
+ col_lookup,
+ item,
+ axis=self._determine_setitem_axis(
+ row_lookup, col_lookup, row_scaler, col_scaler
+ ),
+ )
def _compute_lookup(self, row_loc, col_loc):
"""
diff --git a/modin/pandas/io.py b/modin/pandas/io.py
index b2349769be9..64a8c4295de 100644
--- a/modin/pandas/io.py
+++ b/modin/pandas/io.py
@@ -23,92 +23,24 @@
"""
import inspect
+import pickle
import pandas
+import pandas._libs.lib as lib
import pathlib
import re
from collections import OrderedDict
+from pandas._typing import FilePathOrBuffer, StorageOptions
from typing import Union, IO, AnyStr, Sequence, Dict, List, Optional, Any
-from pandas._typing import FilePathOrBuffer
from modin.error_message import ErrorMessage
from .dataframe import DataFrame
-from modin.utils import _inherit_func_docstring, _inherit_docstrings
+from modin.utils import _inherit_func_docstring, _inherit_docstrings, Engine
+from . import _update_engine
PQ_INDEX_REGEX = re.compile(r"__index_level_\d+__")
# CSV and table
-def _make_parser_func(sep):
- """
- Create a parser function from the given sep.
-
- Parameters
- ----------
- sep: str
- The separator default to use for the parser.
-
- Returns
- -------
- A function object.
- """
-
- def parser_func(
- filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
- sep=sep,
- delimiter=None,
- header="infer",
- names=None,
- index_col=None,
- usecols=None,
- squeeze=False,
- prefix=None,
- mangle_dupe_cols=True,
- dtype=None,
- engine=None,
- converters=None,
- true_values=None,
- false_values=None,
- skipinitialspace=False,
- skiprows=None,
- nrows=None,
- na_values=None,
- keep_default_na=True,
- na_filter=True,
- verbose=False,
- skip_blank_lines=True,
- parse_dates=False,
- infer_datetime_format=False,
- keep_date_col=False,
- date_parser=None,
- dayfirst=False,
- cache_dates=True,
- iterator=False,
- chunksize=None,
- compression="infer",
- thousands=None,
- decimal: str = ".",
- lineterminator=None,
- quotechar='"',
- quoting=0,
- escapechar=None,
- comment=None,
- encoding=None,
- dialect=None,
- error_bad_lines=True,
- warn_bad_lines=True,
- skipfooter=0,
- doublequote=True,
- delim_whitespace=False,
- low_memory=True,
- memory_map=False,
- float_precision=None,
- ):
- _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
- if kwargs.get("sep", sep) is False:
- kwargs["sep"] = "\t"
- return _read(**kwargs)
-
- return parser_func
def _read(**kwargs):
@@ -124,6 +56,7 @@ def _read(**kwargs):
"""
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
pd_obj = EngineDispatcher.read_csv(**kwargs)
# This happens when `read_csv` returns a TextFileReader object for iterating through
if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
@@ -135,17 +68,153 @@ def _read(**kwargs):
return DataFrame(query_compiler=pd_obj)
-read_table = _inherit_func_docstring(pandas.read_table)(_make_parser_func(sep="\t"))
-read_csv = _inherit_func_docstring(pandas.read_csv)(_make_parser_func(sep=","))
+@_inherit_func_docstring(pandas.read_csv)
+def read_csv(
+ filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
+ sep=lib.no_default,
+ delimiter=None,
+ header="infer",
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ prefix=None,
+ mangle_dupe_cols=True,
+ dtype=None,
+ engine=None,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skipinitialspace=False,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ keep_default_na=True,
+ na_filter=True,
+ verbose=False,
+ skip_blank_lines=True,
+ parse_dates=False,
+ infer_datetime_format=False,
+ keep_date_col=False,
+ date_parser=None,
+ dayfirst=False,
+ cache_dates=True,
+ iterator=False,
+ chunksize=None,
+ compression="infer",
+ thousands=None,
+ decimal: str = ".",
+ lineterminator=None,
+ quotechar='"',
+ quoting=0,
+ escapechar=None,
+ comment=None,
+ encoding=None,
+ dialect=None,
+ error_bad_lines=True,
+ warn_bad_lines=True,
+ skipfooter=0,
+ doublequote=True,
+ delim_whitespace=False,
+ low_memory=True,
+ memory_map=False,
+ float_precision=None,
+ storage_options: StorageOptions = None,
+):
+ # ISSUE #2408: collect the parameters shared between pandas read_csv and read_table and update them with the provided arguments
+ _pd_read_csv_signature = {
+ val.name for val in inspect.signature(pandas.read_csv).parameters.values()
+ }
+ _, _, _, f_locals = inspect.getargvalues(inspect.currentframe())
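+ # `lib.no_default` means the caller did not pass `sep`, so restore pandas'
+ # default of ","; `sep=False` is treated as a tab separator.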
+ if f_locals.get("sep", sep) is lib.no_default:
+ f_locals["sep"] = ","
+ elif f_locals.get("sep", sep) is False:
+ f_locals["sep"] = "\t"
+ kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_csv_signature}
+ return _read(**kwargs)
+
+
+@_inherit_func_docstring(pandas.read_table)
+def read_table(
+ filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
+ sep=lib.no_default,
+ delimiter=None,
+ header="infer",
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ prefix=None,
+ mangle_dupe_cols=True,
+ dtype=None,
+ engine=None,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skipinitialspace=False,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ keep_default_na=True,
+ na_filter=True,
+ verbose=False,
+ skip_blank_lines=True,
+ parse_dates=False,
+ infer_datetime_format=False,
+ keep_date_col=False,
+ date_parser=None,
+ dayfirst=False,
+ cache_dates=True,
+ iterator=False,
+ chunksize=None,
+ compression="infer",
+ thousands=None,
+ decimal: str = ".",
+ lineterminator=None,
+ quotechar='"',
+ quoting=0,
+ escapechar=None,
+ comment=None,
+ encoding=None,
+ dialect=None,
+ error_bad_lines=True,
+ warn_bad_lines=True,
+ skipfooter=0,
+ doublequote=True,
+ delim_whitespace=False,
+ low_memory=True,
+ memory_map=False,
+ float_precision=None,
+):
+ # ISSUE #2408: collect the parameters shared between pandas read_csv and read_table and update them with the provided arguments
+ _pd_read_csv_signature = {
+ val.name for val in inspect.signature(pandas.read_csv).parameters.values()
+ }
+ _, _, _, f_locals = inspect.getargvalues(inspect.currentframe())
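+ # read_table defaults to a tab separator, so both an unspecified `sep`
+ # (lib.no_default) and `sep=False` map to "\t".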
+ if f_locals.get("sep", sep) is False or f_locals.get("sep", sep) is lib.no_default:
+ f_locals["sep"] = "\t"
+ kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_csv_signature}
+ return _read(**kwargs)
@_inherit_func_docstring(pandas.read_parquet)
-def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
+def read_parquet(
+ path,
+ engine: str = "auto",
+ columns=None,
+ use_nullable_dtypes: bool = False,
+ **kwargs,
+):
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(
query_compiler=EngineDispatcher.read_parquet(
- path=path, columns=columns, engine=engine, **kwargs
+ path=path,
+ columns=columns,
+ engine=engine,
+ use_nullable_dtypes=use_nullable_dtypes,
+ **kwargs,
)
)
@@ -167,11 +236,13 @@ def read_json(
chunksize=None,
compression="infer",
nrows: Optional[int] = None,
+ storage_options: StorageOptions = None,
):
_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_json(**kwargs))
@@ -188,8 +259,6 @@ def read_gbq(
configuration: Optional[Dict[str, Any]] = None,
credentials=None,
use_bqstorage_api: Optional[bool] = None,
- private_key=None,
- verbose=None,
progress_bar_type: Optional[str] = None,
max_results: Optional[int] = None,
) -> DataFrame:
@@ -198,6 +267,7 @@ def read_gbq(
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_gbq(**kwargs))
@@ -223,6 +293,7 @@ def read_html(
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_html(**kwargs))
@@ -233,6 +304,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_clipboard(**kwargs))
@@ -254,6 +326,7 @@ def read_excel(
nrows=None,
na_values=None,
keep_default_na=True,
+ na_filter=True,
verbose=False,
parse_dates=False,
date_parser=None,
@@ -262,12 +335,13 @@ def read_excel(
skipfooter=0,
convert_float=True,
mangle_dupe_cols=True,
- na_filter=True,
+ storage_options: StorageOptions = None,
):
_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
intermediate = EngineDispatcher.read_excel(**kwargs)
if isinstance(intermediate, (OrderedDict, dict)):
parsed = type(intermediate)()
@@ -297,15 +371,22 @@ def read_hdf(
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_hdf(**kwargs))
@_inherit_func_docstring(pandas.read_feather)
-def read_feather(path, columns=None, use_threads: bool = True):
+def read_feather(
+ path,
+ columns=None,
+ use_threads: bool = True,
+ storage_options: StorageOptions = None,
+):
_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_feather(**kwargs))
@@ -321,11 +402,13 @@ def read_stata(
order_categoricals=True,
chunksize=None,
iterator=False,
+ storage_options: StorageOptions = None,
):
_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_stata(**kwargs))
@@ -342,17 +425,21 @@ def read_sas(
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_sas(**kwargs))
@_inherit_func_docstring(pandas.read_pickle)
def read_pickle(
- filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer"
+ filepath_or_buffer: FilePathOrBuffer,
+ compression: Optional[str] = "infer",
+ storage_options: StorageOptions = None,
):
_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_pickle(**kwargs))
@@ -371,6 +458,7 @@ def read_sql(
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
if kwargs.get("chunksize") is not None:
ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
df_gen = pandas.read_sql(**kwargs)
@@ -390,6 +478,7 @@ def read_fwf(
):
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
kwargs.update(kwargs.pop("kwds", {}))
pd_obj = EngineDispatcher.read_fwf(**kwargs)
@@ -418,6 +507,7 @@ def read_sql_table(
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_sql_table(**kwargs))
@@ -435,6 +525,7 @@ def read_sql_query(
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(query_compiler=EngineDispatcher.read_sql_query(**kwargs))
@@ -446,6 +537,7 @@ def read_spss(
):
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
return DataFrame(
query_compiler=EngineDispatcher.read_spss(path, usecols, convert_categoricals)
)
@@ -456,10 +548,12 @@ def to_pickle(
obj: Any,
filepath_or_buffer: Union[str, pathlib.Path],
compression: Optional[str] = "infer",
- protocol: int = 4,
+ protocol: int = pickle.HIGHEST_PROTOCOL,
+ storage_options: StorageOptions = None,
):
from modin.data_management.factories.dispatcher import EngineDispatcher
+ Engine.subscribe(_update_engine)
if isinstance(obj, DataFrame):
obj = obj._query_compiler
return EngineDispatcher.to_pickle(
@@ -479,6 +573,7 @@ def json_normalize(
max_level: Optional[int] = None,
) -> DataFrame:
ErrorMessage.default_to_pandas("json_normalize")
+ Engine.subscribe(_update_engine)
return DataFrame(
pandas.json_normalize(
data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level
@@ -491,6 +586,7 @@ def read_orc(
path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs
) -> DataFrame:
ErrorMessage.default_to_pandas("read_orc")
+ Engine.subscribe(_update_engine)
return DataFrame(pandas.read_orc(path, columns, **kwargs))
diff --git a/modin/pandas/iterator.py b/modin/pandas/iterator.py
index 45233d65c31..acff45fad21 100644
--- a/modin/pandas/iterator.py
+++ b/modin/pandas/iterator.py
@@ -13,7 +13,7 @@
"""Place to define the Modin iterator."""
-from collections import Iterator
+from collections.abc import Iterator
class PartitionIterator(Iterator):
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index 3300130188e..8d0c8dc07e2 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -36,11 +36,13 @@
from typing import Union, Optional
import warnings
-from modin.utils import _inherit_docstrings, to_pandas
+from modin.utils import _inherit_docstrings, to_pandas, Engine
from modin.config import IsExperimental
from .base import BasePandasDataset, _ATTRS_NO_LOOKUP
from .iterator import PartitionIterator
from .utils import from_pandas, is_scalar
+from .accessor import CachedAccessor, SparseAccessor
+from . import _update_engine
@_inherit_docstrings(pandas.Series, excluded=[pandas.Series.__init__])
@@ -76,6 +78,7 @@ def __init__(
query_compiler: query_compiler
A query compiler object to create the Series from.
"""
+ Engine.subscribe(_update_engine)
if isinstance(data, type(self)):
query_compiler = data._query_compiler.copy()
if index is not None:
@@ -139,9 +142,17 @@ def __radd__(self, left):
return self.add(left)
def __and__(self, other):
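+ # List-like operands are routed through pandas; the same pattern is applied
+ # to the other bitwise dunders below.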
+ if isinstance(other, (list, np.ndarray, pandas.Series)):
+ return self._default_to_pandas(pandas.Series.__and__, other)
new_self, new_other = self._prepare_inter_op(other)
return super(Series, new_self).__and__(new_other)
+ def __rand__(self, other):
+ if isinstance(other, (list, np.ndarray, pandas.Series)):
+ return self._default_to_pandas(pandas.Series.__rand__, other)
+ new_self, new_other = self._prepare_inter_op(other)
+ return super(Series, new_self).__rand__(new_other)
+
def __array__(self, dtype=None):
return super(Series, self).__array__(dtype).flatten()
@@ -166,12 +177,6 @@ def __delitem__(self, key):
raise KeyError(key)
self.drop(labels=key, inplace=True)
- def __div__(self, right):
- return self.div(right)
-
- def __rdiv__(self, left):
- return self.rdiv(left)
-
def __divmod__(self, right):
return self.divmod(right)
@@ -214,9 +219,29 @@ def __rmul__(self, left):
return self.rmul(left)
def __or__(self, other):
+ if isinstance(other, (list, np.ndarray, pandas.Series)):
+ return self._default_to_pandas(pandas.Series.__or__, other)
new_self, new_other = self._prepare_inter_op(other)
return super(Series, new_self).__or__(new_other)
+ def __ror__(self, other):
+ if isinstance(other, (list, np.ndarray, pandas.Series)):
+ return self._default_to_pandas(pandas.Series.__ror__, other)
+ new_self, new_other = self._prepare_inter_op(other)
+ return super(Series, new_self).__ror__(new_other)
+
+ def __xor__(self, other):
+ if isinstance(other, (list, np.ndarray, pandas.Series)):
+ return self._default_to_pandas(pandas.Series.__xor__, other)
+ new_self, new_other = self._prepare_inter_op(other)
+ return super(Series, new_self).__xor__(new_other)
+
+ def __rxor__(self, other):
+ if isinstance(other, (list, np.ndarray, pandas.Series)):
+ return self._default_to_pandas(pandas.Series.__rxor__, other)
+ new_self, new_other = self._prepare_inter_op(other)
+ return super(Series, new_self).__rxor__(new_other)
+
def __pow__(self, right):
return self.pow(right)
@@ -260,11 +285,10 @@ def __round__(self, decimals=0):
)
def __setitem__(self, key, value):
- if key not in self.keys():
- raise KeyError(key)
- self._create_or_update_from_compiler(
- self._query_compiler.setitem(1, key, value), inplace=True
- )
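+ # Slice keys go through the dedicated slice handler; all other keys are
+ # assigned via label-based `.loc`.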
+ if isinstance(key, slice):
+ self._setitem_slice(key, value)
+ else:
+ self.loc[key] = value
def __sub__(self, right):
return self.sub(right)
@@ -288,10 +312,6 @@ def __rtruediv__(self, left):
def values(self):
return super(Series, self).to_numpy().flatten()
- def __xor__(self, other):
- new_self, new_other = self._prepare_inter_op(other)
- return super(Series, new_self).__xor__(new_other)
-
def add(self, other, level=None, fill_value=None, axis=0):
new_self, new_other = self._prepare_inter_op(other)
return super(Series, new_self).add(
@@ -479,13 +499,21 @@ def compare(
keep_shape: bool = False,
keep_equal: bool = False,
):
- return self._default_to_pandas(
- pandas.Series.compare,
- other=other,
+ if not isinstance(other, Series):
+ raise TypeError(f"Cannot compare Series to {type(other)}")
+ result = self.to_frame().compare(
+ other.to_frame(),
align_axis=align_axis,
keep_shape=keep_shape,
keep_equal=keep_equal,
)
+ if align_axis == "columns" or align_axis == 1:
+ # pandas.DataFrame.compare returns a DataFrame with a MultiIndex as the
+ # columns, so we have to convert the column object back to a flat Index.
+ result.columns = pandas.Index(["self", "other"])
+ else:
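+ # With `align_axis` along the index the frame-level result has a single
+ # column, so squeeze it back to a Series and drop the residual name.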
+ result = result.squeeze().rename(None)
+ return result
def corr(self, other, method="pearson", min_periods=None):
if method == "pearson":
@@ -817,30 +845,6 @@ def arg(s):
)
)
- def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.median(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.median(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
def memory_usage(self, index=True, deep=False):
if index:
result = self._reduce_dimension(
@@ -877,34 +881,6 @@ def nlargest(self, n=5, keep="first"):
def nsmallest(self, n=5, keep="first"):
return Series(query_compiler=self._query_compiler.nsmallest(n=n, keep=keep))
- def sem(
- self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
- ):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.sem(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.sem(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
def slice_shift(self, periods=1, axis=0):
if periods == 0:
return self.copy()
@@ -929,6 +905,11 @@ def slice_shift(self, periods=1, axis=0):
)
)
+ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ return super(type(self), self).shift(
+ periods=periods, freq=freq, axis=axis, fill_value=fill_value
+ )
+
def unstack(self, level=-1, fill_value=None):
from .dataframe import DataFrame
@@ -938,58 +919,6 @@ def unstack(self, level=-1, fill_value=None):
return result.droplevel(0, axis=1) if result.columns.nlevels > 1 else result
- def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.skew(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.skew(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
- def std(
- self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
- ):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.std(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.std(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
@property
def plot(
self,
@@ -1036,22 +965,21 @@ def prod(
**kwargs,
):
axis = self._get_axis_number(axis)
+ if level is not None:
+ return self._default_to_pandas(
+ "prod",
+ axis=axis,
+ skipna=skipna,
+ level=level,
+ numeric_only=numeric_only,
+ min_count=min_count,
+ **kwargs,
+ )
new_index = self.columns if axis else self.index
if min_count > len(new_index):
return np.nan
data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
- if level is not None:
- return data.__constructor__(
- query_compiler=data._query_compiler.prod_min_count(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- min_count=min_count,
- **kwargs,
- )
- )
if min_count > 1:
return data._reduce_dimension(
data._query_compiler.prod_min_count(
@@ -1291,9 +1219,7 @@ def sort_values(
result._query_compiler, inplace=inplace
)
- @property
- def sparse(self):
- return self._default_to_pandas(pandas.Series.sparse)
+ sparse = CachedAccessor("sparse", SparseAccessor)
def squeeze(self, axis=None):
if axis is not None:
@@ -1322,6 +1248,17 @@ def sum(
**kwargs,
):
axis = self._get_axis_number(axis)
+ if level is not None:
+ return self._default_to_pandas(
+ "sum",
+ axis=axis,
+ skipna=skipna,
+ level=level,
+ numeric_only=numeric_only,
+ min_count=min_count,
+ **kwargs,
+ )
+
new_index = self.columns if axis else self.index
if min_count > len(new_index):
return np.nan
@@ -1329,17 +1266,6 @@ def sum(
data = self._validate_dtypes_sum_prod_mean(
axis, numeric_only, ignore_axis=False
)
- if level is not None:
- return data.__constructor__(
- query_compiler=data._query_compiler.sum_min_count(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- min_count=min_count,
- **kwargs,
- )
- )
if min_count > 1:
return data._reduce_dimension(
data._query_compiler.sum_min_count(
@@ -1450,7 +1376,9 @@ def truncate(self, before=None, after=None, axis=None, copy=True):
)
def unique(self):
- return self._query_compiler.unique().to_numpy().squeeze()
+ return self.__constructor__(
+ query_compiler=self._query_compiler.unique()
+ ).to_numpy()
def update(self, other):
if not isinstance(other, Series):
@@ -1471,34 +1399,6 @@ def value_counts(
)
)
- def var(
- self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
- ):
- axis = self._get_axis_number(axis)
- if numeric_only is not None and not numeric_only:
- self._validate_dtypes(numeric_only=True)
- if level is not None:
- return self.__constructor__(
- query_compiler=self._query_compiler.var(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
- return self._reduce_dimension(
- self._query_compiler.var(
- axis=axis,
- skipna=skipna,
- level=level,
- ddof=ddof,
- numeric_only=numeric_only,
- **kwargs,
- )
- )
-
def view(self, dtype=None):
return self.__constructor__(
query_compiler=self._query_compiler.series_view(dtype=dtype)
@@ -1576,7 +1476,7 @@ def hasnans(self):
@property
def is_monotonic(self):
- return self._reduce_dimension(self._query_compiler.is_monotonic())
+ return self._reduce_dimension(self._query_compiler.is_monotonic_increasing())
is_monotonic_increasing = is_monotonic
@@ -1689,6 +1589,11 @@ def _validate_dtypes_min_max(self, axis, numeric_only):
def _validate_dtypes(self, numeric_only=False):
pass
+ def _get_numeric_data(self, axis: int):
+ # The `numeric_only` parameter is not supported by Series, so this method
+ # doesn't do anything.
+ return self
+
def _update_inplace(self, new_query_compiler):
"""
Implement [METHOD_NAME].
@@ -1764,9 +1669,11 @@ def _prepare_inter_op(self, other):
"""
if isinstance(other, Series):
new_self = self.copy()
- new_self.name = "__reduced__"
new_other = other.copy()
- new_other.name = "__reduced__"
+ if self.name == other.name:
+ new_self.name = new_other.name = self.name
+ else:
+ new_self.name = new_other.name = "__reduced__"
else:
new_self = self
new_other = other
diff --git a/modin/pandas/test/conftest.py b/modin/pandas/test/conftest.py
deleted file mode 100644
index 2cc83a8e068..00000000000
--- a/modin/pandas/test/conftest.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Licensed to Modin Development Team under one or more contributor license agreements.
-# See the NOTICE file distributed with this work for additional information regarding
-# copyright ownership. The Modin Development Team licenses this file to you under the
-# Apache License, Version 2.0 (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under
-# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
-# ANY KIND, either express or implied. See the License for the specific language
-# governing permissions and limitations under the License.
-
-import modin
-
-from modin.backends import PandasQueryCompiler, BaseQueryCompiler
-from modin.engines.python.pandas_on_python.io import PandasOnPythonIO
-from modin.data_management.factories import factories
-from modin.utils import get_current_backend
-
-import pytest
-
-BASE_BACKEND_NAME = "BaseOnPython"
-
-
-class TestQC(BaseQueryCompiler):
- def __init__(self, modin_frame):
- self._modin_frame = modin_frame
-
- @classmethod
- def from_pandas(cls, df, data_cls):
- return cls(data_cls.from_pandas(df))
-
- @classmethod
- def from_arrow(cls, at, data_cls):
- return cls(data_cls.from_arrow(at))
-
- def free(self):
- pass
-
- to_pandas = PandasQueryCompiler.to_pandas
- default_to_pandas = PandasQueryCompiler.default_to_pandas
-
-
-class BaseOnPythonIO(PandasOnPythonIO):
- query_compiler_cls = TestQC
-
-
-class BaseOnPythonFactory(factories.BaseFactory):
- @classmethod
- def prepare(cls):
- cls.io_cls = BaseOnPythonIO
-
-
-def set_base_backend(name=BASE_BACKEND_NAME):
- setattr(factories, f"{name}Factory", BaseOnPythonFactory)
- modin.set_backends(engine="python", partition=name.split("On")[0])
-
-
-def pytest_addoption(parser):
- parser.addoption("--backend", action="store", default=None)
-
-
-def pytest_configure(config):
- backend = config.option.backend
-
- if backend is None:
- return
-
- if backend == BASE_BACKEND_NAME:
- set_base_backend(BASE_BACKEND_NAME)
- else:
- partition, engine = backend.split("On")
- modin.set_base_backend(engine=engine, partition=backend)
-
-
-def pytest_runtest_call(item):
- custom_markers = ["xfail", "skip"]
-
- # dynamicly adding custom markers to tests
- for custom_marker in custom_markers:
- for marker in item.iter_markers(name=f"{custom_marker}_backends"):
- backends = marker.args[0]
- if not isinstance(backends, list):
- backends = [backends]
-
- current_backend = get_current_backend()
- reason = marker.kwargs.pop("reason", "")
-
- item.add_marker(
- getattr(pytest.mark, custom_marker)(
- condition=current_backend in backends,
- reason=f"Backend {current_backend} does not pass this test. {reason}",
- **marker.kwargs,
- )
- )
diff --git a/modin/pandas/test/data/excel_sheetname_title.xlsx b/modin/pandas/test/data/excel_sheetname_title.xlsx
new file mode 100644
index 00000000000..4484f5f0f4e
Binary files /dev/null and b/modin/pandas/test/data/excel_sheetname_title.xlsx differ
diff --git a/modin/pandas/test/data/issue_2239.csv b/modin/pandas/test/data/issue_2239.csv
new file mode 100644
index 00000000000..f70cacd0f98
--- /dev/null
+++ b/modin/pandas/test/data/issue_2239.csv
@@ -0,0 +1,146 @@
+1585542839.000000, 1585542839.000000, 1585542839.000000
+32.000000, 32.000000, 32.000000
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-13,51
+-38,-14,51
+-38,-14,50
+-38,-13,51
+-38,-14,50
+-38,-14,51
+-38,-13,51
diff --git a/modin/pandas/test/data/issue_621.csv b/modin/pandas/test/data/issue_621.csv
deleted file mode 100644
index c0d924616ef..00000000000
--- a/modin/pandas/test/data/issue_621.csv
+++ /dev/null
@@ -1,10 +0,0 @@
-ins_74901673,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,j_217,10,Terminated,673795,673797,m_2637,1,1,13,16,0.02,0.02
-ins_815802872,M1,j_1527,1,Terminated,158478,158520,m_3430,1,1,3,19,0.13,0.18
-ins_564677701,M1,j_2014,1,Terminated,372602,372616,m_1910,1,1,87,116,0.04,0.05
-ins_257566161,M1,j_2014,1,Terminated,372602,372615,m_2485,1,1,91,123,0.05,0.05
-ins_688679908,M1,j_2014,1,Terminated,372602,372615,m_993,1,1,93,141,0.05,0.05
-ins_929638393,M1,j_2014,1,Terminated,372603,372615,m_2808,1,1,100,137,0.05,0.05
-ins_1349024140,M1,j_2014,1,Terminated,372603,372617,m_3736,1,1,82,111,0.05,0.05
-ins_330247444,M1,j_2014,1,Terminated,372603,372617,m_1176,1,1,84,110,0.05,0.05
-ins_833551291,M1,j_2014,1,Terminated,372602,372614,m_2682,1,1,90,159,0.05,0.05
-ins_833550789,M1,j_2014,1,Terminated,372603,372619,m_3625,1,1,78,105,0.05,0.05
diff --git a/modin/pandas/test/data/issue_976.csv b/modin/pandas/test/data/issue_976.csv
new file mode 100644
index 00000000000..48a97102af0
--- /dev/null
+++ b/modin/pandas/test/data/issue_976.csv
@@ -0,0 +1,5 @@
+1;11800000560005;11800000560005;������� ����� ����������;;-;���. ����;�. �������i���;������������� �����;����������� �������;105.6000
+1;10200007400477;10200007400477;�������� ����� ����������;;-;���. ������;����������;³����������� �����;����������� �������;696.6400
+1;11100008540930;11100008540930;���������� ������� ��������;2;9;���. ������;�.�������;����������� �����;����������� �������;124.4800
+1;12300000051493;12300000051493;���������� ����� ����������;;50;���. ����������;��.���������;���'�����-���������� �����;����������� �������;-0.4700
+1;12300000117460;12300000117460;����� ³���� ���������;;60;���. ���������;������;���'�����-���������� �����;����������� �������;221.0400
diff --git a/modin/pandas/test/data/test_emptyline.xlsx b/modin/pandas/test/data/test_emptyline.xlsx
new file mode 100644
index 00000000000..b7f12ca86c0
Binary files /dev/null and b/modin/pandas/test/data/test_emptyline.xlsx differ
diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py
index 20b14faecd5..452682b7fb2 100644
--- a/modin/pandas/test/dataframe/test_binary.py
+++ b/modin/pandas/test/dataframe/test_binary.py
@@ -24,8 +24,9 @@
test_data,
create_test_dfs,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
@@ -115,8 +116,6 @@ def test_math_functions_level(op):
("sub", "subtract"),
("add", "__add__"),
("radd", "__radd__"),
- ("div", "__div__"),
- ("rdiv", "__rdiv__"),
("truediv", "__truediv__"),
("rtruediv", "__rtruediv__"),
("floordiv", "__floordiv__"),
@@ -135,7 +134,7 @@ def test_math_alias(math_op, alias):
assert getattr(pd.DataFrame, math_op) == getattr(pd.DataFrame, alias)
-@pytest.mark.parametrize("other", ["as_left", 4, 4.0])
+@pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"])
@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_comparison(data, op, other):
@@ -145,20 +144,6 @@ def test_comparison(data, op, other):
)
-@pytest.mark.xfail_backends(
- ["BaseOnPython"],
- reason="Test is failing because of mismathing of thrown exceptions. See pandas issue #36377",
-)
-@pytest.mark.parametrize("other", ["a"])
-@pytest.mark.parametrize("op", ["ge", "gt", "le", "lt", "eq", "ne"])
-@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
-def test_comparison_except(data, op, other):
- eval_general(
- *create_test_dfs(data),
- lambda df: getattr(df, op)(other),
- )
-
-
@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_multi_level_comparison(data, op):
@@ -197,3 +182,62 @@ def test_equals():
df_equals(modin_df3, modin_df2)
assert modin_df1.equals(modin_df2._query_compiler.to_pandas())
+
+
+@pytest.mark.parametrize("is_more_other_partitions", [True, False])
+@pytest.mark.parametrize(
+ "op_type", ["df_ser", "df_df", "ser_ser_same_name", "ser_ser_different_name"]
+)
+@pytest.mark.parametrize(
+ "is_idx_aligned", [True, False], ids=["idx_aligned", "idx_not_aligned"]
+)
+def test_mismatched_row_partitions(is_idx_aligned, op_type, is_more_other_partitions):
+ data = [0, 1, 2, 3, 4, 5]
+ modin_df1, pandas_df1 = create_test_dfs({"a": data, "b": data})
+ modin_df, pandas_df = modin_df1.loc[:2], pandas_df1.loc[:2]
+
+ modin_df2 = modin_df.append(modin_df)
+ pandas_df2 = pandas_df.append(pandas_df)
+ if is_more_other_partitions:
+ modin_df2, modin_df1 = modin_df1, modin_df2
+ pandas_df2, pandas_df1 = pandas_df1, pandas_df2
+
+ if is_idx_aligned:
+ if is_more_other_partitions:
+ modin_df1.index = pandas_df1.index = pandas_df2.index
+ else:
+ modin_df2.index = pandas_df2.index = pandas_df1.index
+
+ # pandas doesn't support this case because the result would contain duplicate values along the column axis.
+ if op_type == "df_ser" and not is_idx_aligned and is_more_other_partitions:
+ eval_general(
+ modin_df2,
+ pandas_df2,
+ lambda df: df / modin_df1.a
+ if isinstance(df, pd.DataFrame)
+ else df / pandas_df1.a,
+ )
+ return
+
+ if op_type == "df_ser":
+ modin_res = modin_df2 / modin_df1.a
+ pandas_res = pandas_df2 / pandas_df1.a
+ elif op_type == "df_df":
+ modin_res = modin_df2 / modin_df1
+ pandas_res = pandas_df2 / pandas_df1
+ elif op_type == "ser_ser_same_name":
+ modin_res = modin_df2.a / modin_df1.a
+ pandas_res = pandas_df2.a / pandas_df1.a
+ elif op_type == "ser_ser_different_name":
+ modin_res = modin_df2.a / modin_df1.b
+ pandas_res = pandas_df2.a / pandas_df1.b
+ df_equals(modin_res, pandas_res)
+
+
+def test_duplicate_indexes():
+ data = [0, 1, 2, 3, 4, 5]
+ modin_df1, pandas_df1 = create_test_dfs(
+ {"a": data, "b": data}, index=[0, 1, 2, 0, 1, 2]
+ )
+ modin_df2, pandas_df2 = create_test_dfs({"a": data, "b": data})
+ df_equals(modin_df1 / modin_df2, pandas_df1 / pandas_df2)
diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py
index b8b39c203da..686c3298f8d 100644
--- a/modin/pandas/test/dataframe/test_default.py
+++ b/modin/pandas/test/dataframe/test_default.py
@@ -38,8 +38,9 @@
test_data_diff_dtype,
modin_df_almost_equals_pandas,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
@@ -63,6 +64,8 @@
("pct_change", None),
("__getstate__", None),
("to_xarray", None),
+ ("flags", None),
+ ("set_flags", lambda df: {"allows_duplicate_labels": False}),
],
)
def test_ops_defaulting_to_pandas(op, make_args):
@@ -72,7 +75,11 @@ def test_ops_defaulting_to_pandas(op, make_args):
if make_args is not None:
operation(**make_args(modin_df))
else:
- operation()
+ try:
+ operation()
+ # `except` handles non-callable attributes
+ except TypeError:
+ pass
def test_style():
@@ -1151,6 +1158,13 @@ def test___bool__(data):
eval_general(*create_test_dfs(data), lambda df: df.__bool__())
-@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
-def test_hasattr_sparse(data):
- eval_general(*create_test_dfs(data), lambda df: hasattr(df, "sparse"))
+@pytest.mark.parametrize(
+ "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"]
+)
+def test_hasattr_sparse(is_sparse_data):
+ modin_df, pandas_df = (
+ create_test_dfs(pandas.arrays.SparseArray(test_data["float_nan_data"].values()))
+ if is_sparse_data
+ else create_test_dfs(test_data["float_nan_data"])
+ )
+ eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse"))
diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
index fe2128b8782..1f94f0ba200 100644
--- a/modin/pandas/test/dataframe/test_indexing.py
+++ b/modin/pandas/test/dataframe/test_indexing.py
@@ -14,7 +14,7 @@
import pytest
import numpy as np
import pandas
-import pandas.util.testing as tm
+from pandas.testing import assert_index_equal
import matplotlib
import modin.pandas as pd
import sys
@@ -34,14 +34,27 @@
int_arg_keys,
int_arg_values,
create_test_dfs,
+ eval_general,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
+def eval_setitem(md_df, pd_df, value, col=None, loc=None):
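+ # Resolve `col` (given directly or via positional `loc`), assign `value`
+ # (or `value(df)` if callable) to that column on both frames, and compare
+ # the frames in place via `eval_general`.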
+ if loc is not None:
+ col = pd_df.columns[loc]
+
+ value_getter = value if callable(value) else (lambda *args, **kwargs: value)
+
+ eval_general(
+ md_df, pd_df, lambda df: df.__setitem__(col, value_getter(df)), __inplace__=True
+ )
+
+
@pytest.mark.parametrize(
"dates",
[
@@ -387,12 +400,9 @@ def test_loc_multi_index():
df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]])
-@pytest.mark.parametrize("index", [["row1", "row2", "row3"], ["row1"]])
-@pytest.mark.parametrize("columns", [["col1", "col2"], ["col1"]])
+@pytest.mark.parametrize("index", [["row1", "row2", "row3"]])
+@pytest.mark.parametrize("columns", [["col1", "col2"]])
def test_loc_assignment(index, columns):
- if len(index) == 1 and len(columns) == 1:
- pytest.skip("See Modin issue #2253 for details")
-
md_df, pd_df = create_test_dfs(index=index, columns=columns)
for i, ind in enumerate(index):
for j, col in enumerate(columns):
@@ -402,6 +412,48 @@ def test_loc_assignment(index, columns):
df_equals(md_df, pd_df)
+@pytest.fixture
+def loc_iter_dfs():
+ columns = ["col1", "col2", "col3"]
+ index = ["row1", "row2", "row3"]
+ return create_test_dfs(
+ {col: ([idx] * len(index)) for idx, col in enumerate(columns)},
+ columns=columns,
+ index=index,
+ )
+
+
+@pytest.mark.parametrize("reverse_order", [False, True])
+@pytest.mark.parametrize("axis", [0, 1])
+def test_loc_iter_assignment(loc_iter_dfs, reverse_order, axis):
+ if reverse_order and axis:
+ pytest.xfail(
+ "Due to internal sorting of lookup values assignment order is lost, see GH-#2552"
+ )
+
+ md_df, pd_df = loc_iter_dfs
+
+ select = [slice(None), slice(None)]
+ select[axis] = sorted(pd_df.axes[axis][:-1], reverse=reverse_order)
+ select = tuple(select)
+
+ pd_df.loc[select] = pd_df.loc[select] + pd_df.loc[select]
+ md_df.loc[select] = md_df.loc[select] + md_df.loc[select]
+ df_equals(md_df, pd_df)
+
+
+@pytest.mark.parametrize("reverse_order", [False, True])
+@pytest.mark.parametrize("axis", [0, 1])
+def test_loc_order(loc_iter_dfs, reverse_order, axis):
+ md_df, pd_df = loc_iter_dfs
+
+ select = [slice(None), slice(None)]
+ select[axis] = sorted(pd_df.axes[axis][:-1], reverse=reverse_order)
+ select = tuple(select)
+
+ df_equals(pd_df.loc[select], md_df.loc[select])
+
+
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_loc_nested_assignment(data):
modin_df = pd.DataFrame(data)
@@ -454,6 +506,15 @@ def test_iloc_nested_assignment(data):
df_equals(modin_df, pandas_df)
+def test_loc_series():
+ md_df, pd_df = create_test_dfs({"a": [1, 2], "b": [3, 4]})
+
+ pd_df.loc[pd_df["a"] > 1, "b"] = np.log(pd_df["b"])
+ md_df.loc[md_df["a"] > 1, "b"] = np.log(md_df["b"])
+
+ df_equals(pd_df, md_df)
+
+
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_pop(request, data):
modin_df = pd.DataFrame(data)
@@ -521,13 +582,15 @@ def test_reindex_like():
def test_rename_sanity():
- test_data = pandas.DataFrame(tm.getSeriesData())
- mapping = {"A": "a", "B": "b", "C": "c", "D": "d"}
+ source_df = pandas.DataFrame(test_data["int_data"])[
+ ["col1", "index", "col3", "col4"]
+ ]
+ mapping = {"col1": "a", "index": "b", "col3": "c", "col4": "d"}
- modin_df = pd.DataFrame(test_data)
- df_equals(modin_df.rename(columns=mapping), test_data.rename(columns=mapping))
+ modin_df = pd.DataFrame(source_df)
+ df_equals(modin_df.rename(columns=mapping), source_df.rename(columns=mapping))
- renamed2 = test_data.rename(columns=str.lower)
+ renamed2 = source_df.rename(columns=str.lower)
df_equals(modin_df.rename(columns=str.lower), renamed2)
modin_df = pd.DataFrame(renamed2)
@@ -539,20 +602,20 @@ def test_rename_sanity():
# gets sorted alphabetical
df = pandas.DataFrame(data)
modin_df = pd.DataFrame(data)
- tm.assert_index_equal(
+ assert_index_equal(
modin_df.rename(index={"foo": "bar", "bar": "foo"}).index,
df.rename(index={"foo": "bar", "bar": "foo"}).index,
)
- tm.assert_index_equal(
+ assert_index_equal(
modin_df.rename(index=str.upper).index, df.rename(index=str.upper).index
)
# Using the `mapper` functionality with `axis`
- tm.assert_index_equal(
+ assert_index_equal(
modin_df.rename(str.upper, axis=0).index, df.rename(str.upper, axis=0).index
)
- tm.assert_index_equal(
+ assert_index_equal(
modin_df.rename(str.upper, axis=1).columns,
df.rename(str.upper, axis=1).columns,
)
@@ -562,18 +625,18 @@ def test_rename_sanity():
modin_df.rename()
# partial columns
- renamed = test_data.rename(columns={"C": "foo", "D": "bar"})
- modin_df = pd.DataFrame(test_data)
- tm.assert_index_equal(
- modin_df.rename(columns={"C": "foo", "D": "bar"}).index,
- test_data.rename(columns={"C": "foo", "D": "bar"}).index,
+ renamed = source_df.rename(columns={"col3": "foo", "col4": "bar"})
+ modin_df = pd.DataFrame(source_df)
+ assert_index_equal(
+ modin_df.rename(columns={"col3": "foo", "col4": "bar"}).index,
+ source_df.rename(columns={"col3": "foo", "col4": "bar"}).index,
)
# other axis
- renamed = test_data.T.rename(index={"C": "foo", "D": "bar"})
- tm.assert_index_equal(
- test_data.T.rename(index={"C": "foo", "D": "bar"}).index,
- modin_df.T.rename(index={"C": "foo", "D": "bar"}).index,
+ renamed = source_df.T.rename(index={"col3": "foo", "col4": "bar"})
+ assert_index_equal(
+ source_df.T.rename(index={"col3": "foo", "col4": "bar"}).index,
+ modin_df.T.rename(index={"col3": "foo", "col4": "bar"}).index,
)
# index with name
@@ -583,7 +646,7 @@ def test_rename_sanity():
renamed = renamer.rename(index={"foo": "bar", "bar": "foo"})
modin_renamed = modin_df.rename(index={"foo": "bar", "bar": "foo"})
- tm.assert_index_equal(renamed.index, modin_renamed.index)
+ assert_index_equal(renamed.index, modin_renamed.index)
assert renamed.index.name == modin_renamed.index.name
@@ -608,13 +671,13 @@ def test_rename_multiindex():
index={"foo1": "foo3", "bar2": "bar3"},
columns={"fizz1": "fizz3", "buzz2": "buzz3"},
)
- tm.assert_index_equal(renamed.index, modin_renamed.index)
+ assert_index_equal(renamed.index, modin_renamed.index)
renamed = df.rename(
index={"foo1": "foo3", "bar2": "bar3"},
columns={"fizz1": "fizz3", "buzz2": "buzz3"},
)
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
assert renamed.index.names == modin_renamed.index.names
assert renamed.columns.names == modin_renamed.columns.names
@@ -626,68 +689,72 @@ def test_rename_multiindex():
modin_renamed = modin_df.rename(
columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0
)
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz")
modin_renamed = modin_df.rename(
columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz"
)
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1)
modin_renamed = modin_df.rename(
columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1
)
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz")
modin_renamed = modin_df.rename(
columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz"
)
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
# function
func = str.upper
renamed = df.rename(columns=func, level=0)
modin_renamed = modin_df.rename(columns=func, level=0)
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
renamed = df.rename(columns=func, level="fizz")
modin_renamed = modin_df.rename(columns=func, level="fizz")
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
renamed = df.rename(columns=func, level=1)
modin_renamed = modin_df.rename(columns=func, level=1)
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
renamed = df.rename(columns=func, level="buzz")
modin_renamed = modin_df.rename(columns=func, level="buzz")
- tm.assert_index_equal(renamed.columns, modin_renamed.columns)
+ assert_index_equal(renamed.columns, modin_renamed.columns)
# index
renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
modin_renamed = modin_df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
- tm.assert_index_equal(modin_renamed.index, renamed.index)
+ assert_index_equal(modin_renamed.index, renamed.index)
@pytest.mark.skip(reason="Pandas does not pass this test")
def test_rename_nocopy():
- test_data = pandas.DataFrame(tm.getSeriesData())
- modin_df = pd.DataFrame(test_data)
- modin_renamed = modin_df.rename(columns={"C": "foo"}, copy=False)
+ source_df = pandas.DataFrame(test_data["int_data"])[
+ ["col1", "index", "col3", "col4"]
+ ]
+ modin_df = pd.DataFrame(source_df)
+ modin_renamed = modin_df.rename(columns={"col3": "foo"}, copy=False)
modin_renamed["foo"] = 1
- assert (modin_df["C"] == 1).all()
+ assert (modin_df["col3"] == 1).all()
def test_rename_inplace():
- test_data = pandas.DataFrame(tm.getSeriesData())
- modin_df = pd.DataFrame(test_data)
+ source_df = pandas.DataFrame(test_data["int_data"])[
+ ["col1", "index", "col3", "col4"]
+ ]
+ modin_df = pd.DataFrame(source_df)
df_equals(
- modin_df.rename(columns={"C": "foo"}),
- test_data.rename(columns={"C": "foo"}),
+ modin_df.rename(columns={"col3": "foo"}),
+ source_df.rename(columns={"col3": "foo"}),
)
- frame = test_data.copy()
+ frame = source_df.copy()
modin_frame = modin_df.copy()
- frame.rename(columns={"C": "foo"}, inplace=True)
- modin_frame.rename(columns={"C": "foo"}, inplace=True)
+ frame.rename(columns={"col3": "foo"}, inplace=True)
+ modin_frame.rename(columns={"col3": "foo"}, inplace=True)
df_equals(modin_frame, frame)
@@ -752,7 +819,7 @@ def test_rename_axis():
def test_rename_axis_inplace():
- test_frame = pandas.DataFrame(tm.getSeriesData())
+ test_frame = pandas.DataFrame(test_data["int_data"])
modin_df = pd.DataFrame(test_frame)
result = test_frame.copy()
@@ -804,6 +871,42 @@ def test_reorder_levels():
)
+def test_reindex_multiindex():
+ data1, data2 = np.random.randint(1, 20, (5, 5)), np.random.randint(10, 25, 6)
+ index = np.array(["AUD", "BRL", "CAD", "EUR", "INR"])
+ modin_midx = pd.MultiIndex.from_product(
+ [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Curency"]
+ )
+ pandas_midx = pandas.MultiIndex.from_product(
+ [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Curency"]
+ )
+ modin_df1, modin_df2 = (
+ pd.DataFrame(data=data1, index=index, columns=index),
+ pd.DataFrame(data2, modin_midx),
+ )
+ pandas_df1, pandas_df2 = (
+ pandas.DataFrame(data=data1, index=index, columns=index),
+ pandas.DataFrame(data2, pandas_midx),
+ )
+ modin_df2.columns, pandas_df2.columns = ["Notional"], ["Notional"]
+ md_midx = pd.MultiIndex.from_product([modin_df2.index.levels[0], modin_df1.index])
+ pd_midx = pandas.MultiIndex.from_product(
+ [pandas_df2.index.levels[0], pandas_df1.index]
+ )
+ # reindex without axis, index, or columns
+ modin_result = modin_df1.reindex(md_midx, fill_value=0)
+ pandas_result = pandas_df1.reindex(pd_midx, fill_value=0)
+ df_equals(modin_result, pandas_result)
+ # reindex with only axis
+ modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0)
+ pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0)
+ df_equals(modin_result, pandas_result)
+ # reindex with axis and level
+ modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0, level=0)
+ pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0, level=0)
+ df_equals(modin_result, pandas_result)
+
+
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_reset_index(data):
modin_df = pd.DataFrame(data)
@@ -1072,38 +1175,20 @@ def test___getattr__(request, data):
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___setitem__(data):
- modin_df = pd.DataFrame(data)
- pandas_df = pandas.DataFrame(data)
-
- modin_df.__setitem__(modin_df.columns[-1], 1)
- pandas_df.__setitem__(pandas_df.columns[-1], 1)
- df_equals(modin_df, pandas_df)
-
- modin_df = pd.DataFrame(data)
- pandas_df = pandas.DataFrame(data)
-
- modin_df[modin_df.columns[-1]] = pd.DataFrame(modin_df[modin_df.columns[0]])
- pandas_df[pandas_df.columns[-1]] = pandas.DataFrame(pandas_df[pandas_df.columns[0]])
- df_equals(modin_df, pandas_df)
-
- modin_df = pd.DataFrame(data)
- pandas_df = pandas.DataFrame(data)
-
- rows = len(modin_df)
- arr = np.arange(rows * 2).reshape(-1, 2)
- modin_df[modin_df.columns[-1]] = arr
- pandas_df[pandas_df.columns[-1]] = arr
- df_equals(pandas_df, modin_df)
+ eval_setitem(*create_test_dfs(data), loc=-1, value=1)
+ eval_setitem(
+ *create_test_dfs(data), loc=-1, value=lambda df: type(df)(df[df.columns[0]])
+ )
- with pytest.raises(ValueError, match=r"Wrong number of items passed"):
- modin_df["___NON EXISTENT COLUMN"] = arr
+ nrows = len(data[list(data.keys())[0]])
+ arr = np.arange(nrows * 2).reshape(-1, 2)
- modin_df[modin_df.columns[0]] = np.arange(len(modin_df))
- pandas_df[pandas_df.columns[0]] = np.arange(len(pandas_df))
- df_equals(modin_df, pandas_df)
+ eval_setitem(*create_test_dfs(data), loc=-1, value=arr)
+ eval_setitem(*create_test_dfs(data), col="___NON EXISTENT COLUMN", value=arr)
+ eval_setitem(*create_test_dfs(data), loc=0, value=np.arange(nrows))
- modin_df = pd.DataFrame(columns=modin_df.columns)
- pandas_df = pandas.DataFrame(columns=pandas_df.columns)
+ modin_df = pd.DataFrame(columns=data.keys())
+ pandas_df = pandas.DataFrame(columns=data.keys())
for col in modin_df.columns:
modin_df[col] = np.arange(1000)
@@ -1130,19 +1215,16 @@ def test___setitem__(data):
df_equals(modin_df, pandas_df)
assert isinstance(modin_df["new_col"][0], type(pandas_df["new_col"][0]))
+ modin_df[1:5] = 10
+ pandas_df[1:5] = 10
+ df_equals(modin_df, pandas_df)
+
# Transpose test
modin_df = pd.DataFrame(data).T
pandas_df = pandas.DataFrame(data).T
- # We default to pandas on non-string column names
- if not all(isinstance(c, str) for c in modin_df.columns):
- with pytest.warns(UserWarning):
- modin_df[modin_df.columns[0]] = 0
- else:
- modin_df[modin_df.columns[0]] = 0
-
+ modin_df[modin_df.columns[0]] = 0
pandas_df[pandas_df.columns[0]] = 0
-
df_equals(modin_df, pandas_df)
modin_df.columns = [str(i) for i in modin_df.columns]
@@ -1155,10 +1237,42 @@ def test___setitem__(data):
modin_df[modin_df.columns[0]][modin_df.index[0]] = 12345
pandas_df[pandas_df.columns[0]][pandas_df.index[0]] = 12345
+ df_equals(modin_df, pandas_df)
+ modin_df[1:5] = 10
+ pandas_df[1:5] = 10
df_equals(modin_df, pandas_df)
+def test___setitem__partitions_aligning():
+ # from issue #2390
+ modin_df = pd.DataFrame({"a": [1, 2, 3]})
+ pandas_df = pandas.DataFrame({"a": [1, 2, 3]})
+ modin_df["b"] = pd.Series([4, 5, 6, 7, 8])
+ pandas_df["b"] = pandas.Series([4, 5, 6, 7, 8])
+ df_equals(modin_df, pandas_df)
+
+ # from issue #2442
+ data = {"a": [1, 2, 3, 4]}
+ # Index with duplicated timestamp
+ index = pandas.to_datetime(["2020-02-06", "2020-02-06", "2020-02-22", "2020-03-26"])
+
+ md_df, pd_df = create_test_dfs(data, index=index)
+ # Setting new column
+ pd_df["b"] = pandas.Series(np.arange(4))
+ md_df["b"] = pd.Series(np.arange(4))
+ df_equals(md_df, pd_df)
+
+ # Setting existing column
+ pd_df["b"] = pandas.Series(np.arange(4))
+ md_df["b"] = pd.Series(np.arange(4))
+ df_equals(md_df, pd_df)
+
+ pd_df["a"] = pandas.Series(np.arange(4))
+ md_df["a"] = pd.Series(np.arange(4))
+ df_equals(md_df, pd_df)
+
+
def test___setitem__with_mismatched_partitions():
fname = "200kx99.csv"
np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",")
@@ -1201,19 +1315,16 @@ def test___setitem__mask():
"data",
[
{},
- pytest.param(
- {"id": [], "max_speed": [], "health": []},
- marks=pytest.mark.xfail(
- reason="Throws an exception because generally assigning Series or other objects of length different from DataFrame does not work right now"
- ),
- ),
+ {"id": [], "max_speed": [], "health": []},
+ {"id": [1], "max_speed": [2], "health": [3]},
+ {"id": [4, 40, 400], "max_speed": [111, 222, 333], "health": [33, 22, 11]},
],
- ids=["empty", "empty_columns"],
+ ids=["empty_frame", "empty_cols", "1_length_cols", "2_length_cols"],
)
@pytest.mark.parametrize(
"value",
- [np.array(["one", "two"]), [11, 22]],
- ids=["ndarray", "list"],
+ [[11, 22], [11, 22, 33]],
+ ids=["2_length_val", "3_length_val"],
)
@pytest.mark.parametrize("convert_to_series", [False, True])
@pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"])
@@ -1221,9 +1332,19 @@ def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id):
pandas_df = pandas.DataFrame(data)
modin_df = pd.DataFrame(data)
- pandas_df[new_col_id] = pandas.Series(value) if convert_to_series else value
- modin_df[new_col_id] = pd.Series(value) if convert_to_series else value
- df_equals(modin_df, pandas_df)
+ def applyier(df):
+ if convert_to_series:
+ converted_value = (
+ pandas.Series(value)
+ if isinstance(df, pandas.DataFrame)
+ else pd.Series(value)
+ )
+ else:
+ converted_value = value
+ df[new_col_id] = converted_value
+ return df
+
+    eval_general(modin_df, pandas_df, applier)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -1253,3 +1374,17 @@ def test_index_order():
getattr(df_modin, func)(level=0).index,
getattr(df_pandas, func)(level=0).index,
)
+
+
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.parametrize("sortorder", [0, 3, 5])
+def test_multiindex_from_frame(data, sortorder):
+ modin_df, pandas_df = create_test_dfs(data)
+
+ def call_from_frame(df):
+ if type(df).__module__.startswith("pandas"):
+ return pandas.MultiIndex.from_frame(df, sortorder)
+ else:
+ return pd.MultiIndex.from_frame(df, sortorder)
+
+ eval_general(modin_df, pandas_df, call_from_frame, comparator=assert_index_equal)
diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py
index d35687a8123..3b9bfce6cf7 100644
--- a/modin/pandas/test/dataframe/test_iter.py
+++ b/modin/pandas/test/dataframe/test_iter.py
@@ -29,8 +29,9 @@
create_test_dfs,
test_data,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py
index c281f2b9db9..f3e015021c0 100644
--- a/modin/pandas/test/dataframe/test_join_sort.py
+++ b/modin/pandas/test/dataframe/test_join_sort.py
@@ -34,8 +34,9 @@
generate_multiindex,
eval_general,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
@@ -510,3 +511,43 @@ def test_where():
pandas_result = pandas_df.where(pandas_df < 2, True)
modin_result = modin_df.where(modin_df < 2, True)
assert all((to_pandas(modin_result) == pandas_result).all())
+
+
+@pytest.mark.parametrize("align_axis", ["index", "columns"])
+@pytest.mark.parametrize("keep_shape", [False, True])
+@pytest.mark.parametrize("keep_equal", [False, True])
+def test_compare(align_axis, keep_shape, keep_equal):
+ kwargs = {
+ "align_axis": align_axis,
+ "keep_shape": keep_shape,
+ "keep_equal": keep_equal,
+ }
+ frame_data1 = random_state.randn(100, 10)
+ frame_data2 = random_state.randn(100, 10)
+ pandas_df = pandas.DataFrame(frame_data1, columns=list("abcdefghij"))
+ pandas_df2 = pandas.DataFrame(frame_data2, columns=list("abcdefghij"))
+ modin_df = pd.DataFrame(frame_data1, columns=list("abcdefghij"))
+ modin_df2 = pd.DataFrame(frame_data2, columns=list("abcdefghij"))
+
+ modin_result = modin_df.compare(modin_df2, **kwargs)
+ pandas_result = pandas_df.compare(pandas_df2, **kwargs)
+ assert to_pandas(modin_result).equals(pandas_result)
+
+ modin_result = modin_df2.compare(modin_df, **kwargs)
+ pandas_result = pandas_df2.compare(pandas_df, **kwargs)
+ assert to_pandas(modin_result).equals(pandas_result)
+
+ series_data1 = ["a", "b", "c", "d", "e"]
+ series_data2 = ["a", "a", "c", "b", "e"]
+ pandas_series1 = pandas.Series(series_data1)
+ pandas_series2 = pandas.Series(series_data2)
+ modin_series1 = pd.Series(series_data1)
+ modin_series2 = pd.Series(series_data2)
+
+ modin_result = modin_series1.compare(modin_series2, **kwargs)
+ pandas_result = pandas_series1.compare(pandas_series2, **kwargs)
+ assert to_pandas(modin_result).equals(pandas_result)
+
+ modin_result = modin_series2.compare(modin_series1, **kwargs)
+ pandas_result = pandas_series2.compare(pandas_series1, **kwargs)
+ assert to_pandas(modin_result).equals(pandas_result)
diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py
index 5960612d37c..b7866ee18df 100644
--- a/modin/pandas/test/dataframe/test_map_metadata.py
+++ b/modin/pandas/test/dataframe/test_map_metadata.py
@@ -14,7 +14,7 @@
import pytest
import numpy as np
import pandas
-import pandas.util.testing as tm
+from pandas.testing import assert_index_equal
import matplotlib
import modin.pandas as pd
from modin.utils import get_current_backend
@@ -27,6 +27,7 @@
df_is_empty,
arg_keys,
name_contains,
+ test_data,
test_data_values,
test_data_keys,
test_data_with_duplicates_values,
@@ -45,15 +46,18 @@
eval_general,
create_test_dfs,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
def eval_insert(modin_df, pandas_df, **kwargs):
- _kwargs = {"loc": 0, "col": "New column"}
+ if "col" in kwargs and "column" not in kwargs:
+ kwargs["column"] = kwargs.pop("col")
+ _kwargs = {"loc": 0, "column": "New column"}
_kwargs.update(kwargs)
eval_general(
@@ -120,13 +124,13 @@ def test_indexing():
def test_empty_df():
df = pd.DataFrame(index=["a", "b"])
df_is_empty(df)
- tm.assert_index_equal(df.index, pd.Index(["a", "b"]))
+ assert_index_equal(df.index, pd.Index(["a", "b"]))
assert len(df.columns) == 0
df = pd.DataFrame(columns=["a", "b"])
df_is_empty(df)
assert len(df.index) == 0
- tm.assert_index_equal(df.columns, pd.Index(["a", "b"]))
+ assert_index_equal(df.columns, pd.Index(["a", "b"]))
df = pd.DataFrame()
df_is_empty(df)
@@ -135,13 +139,13 @@ def test_empty_df():
df = pd.DataFrame(index=["a", "b"])
df_is_empty(df)
- tm.assert_index_equal(df.index, pd.Index(["a", "b"]))
+ assert_index_equal(df.index, pd.Index(["a", "b"]))
assert len(df.columns) == 0
df = pd.DataFrame(columns=["a", "b"])
df_is_empty(df)
assert len(df.index) == 0
- tm.assert_index_equal(df.columns, pd.Index(["a", "b"]))
+ assert_index_equal(df.columns, pd.Index(["a", "b"]))
df = pd.DataFrame()
df_is_empty(df)
@@ -439,7 +443,7 @@ def test_append(data):
def test_astype():
- td = pandas.DataFrame(tm.getSeriesData())
+ td = pandas.DataFrame(test_data["int_data"])[["col1", "index", "col3", "col4"]]
modin_df = pd.DataFrame(td.values, index=td.index, columns=td.columns)
expected_df = pandas.DataFrame(td.values, index=td.index, columns=td.columns)
@@ -459,13 +463,13 @@ def test_astype():
expected_df_casted = expected_df.astype("category")
df_equals(modin_df_casted, expected_df_casted)
- dtype_dict = {"A": np.int32, "B": np.int64, "C": str}
+ dtype_dict = {"col1": np.int32, "index": np.int64, "col3": str}
modin_df_casted = modin_df.astype(dtype_dict)
expected_df_casted = expected_df.astype(dtype_dict)
df_equals(modin_df_casted, expected_df_casted)
# Ignore lint because this is testing bad input
- bad_dtype_dict = {"B": np.int32, "B": np.int64, "B": str} # noqa F601
+ bad_dtype_dict = {"index": np.int32, "index": np.int64, "index": str} # noqa F601
modin_df_casted = modin_df.astype(bad_dtype_dict)
expected_df_casted = expected_df.astype(bad_dtype_dict)
df_equals(modin_df_casted, expected_df_casted)
@@ -920,19 +924,7 @@ def test_dropna_subset_error(data, axis, subset):
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
-@pytest.mark.parametrize(
- "astype",
- [
- "category",
- pytest.param(
- "int32",
- marks=pytest.mark.xfail(
- reason="Modin astype() does not raises ValueError at non-numeric argument when Pandas does."
- ),
- ),
- "float",
- ],
-)
+@pytest.mark.parametrize("astype", ["category", "int32", "float"])
def test_insert_dtypes(data, astype):
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
@@ -983,6 +975,12 @@ def test_insert(data):
col="DataFrame insert",
value=lambda df: df[[df.columns[0]]],
)
+ eval_insert(
+ modin_df,
+ pandas_df,
+ col="Different indices",
+ value=lambda df: df[[df.columns[0]]].set_index(df.index[::-1]),
+ )
# Bad inserts
eval_insert(modin_df, pandas_df, col="Bad Column", value=lambda df: df)
diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py
index 3fca2993f9a..70338267ff9 100644
--- a/modin/pandas/test/dataframe/test_reduction.py
+++ b/modin/pandas/test/dataframe/test_reduction.py
@@ -35,8 +35,9 @@
generate_multiindex,
test_data_diff_dtype,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
@@ -140,6 +141,26 @@ def test_describe(data, percentiles):
)
+@pytest.mark.parametrize("has_numeric_column", [False, True])
+@pytest.mark.parametrize("datetime_is_numeric", [True, False, None])
+def test_2195(datetime_is_numeric, has_numeric_column):
+ data = {
+ "categorical": pd.Categorical(["d"] * 10 ** 2),
+ "date": [np.datetime64("2000-01-01")] * 10 ** 2,
+ }
+
+ if has_numeric_column:
+ data.update({"numeric": [5] * 10 ** 2})
+
+ modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
+
+ eval_general(
+ modin_df,
+ pandas_df,
+ lambda df: df.describe(datetime_is_numeric=datetime_is_numeric),
+ )
+
+
@pytest.mark.parametrize(
"exclude,include",
[
@@ -285,26 +306,6 @@ def test_prod(
df_equals(modin_result, pandas_result)
-@pytest.mark.parametrize(
- "numeric_only",
- [
- pytest.param(None, marks=pytest.mark.xfail(reason="See #1976 for details")),
- False,
- True,
- ],
-)
-@pytest.mark.parametrize(
- "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys)
-)
-def test_prod_specific(min_count, numeric_only):
- if min_count == 5 and numeric_only:
- pytest.xfail("see #1953 for details")
- eval_general(
- *create_test_dfs(test_data_diff_dtype),
- lambda df: df.prod(min_count=min_count, numeric_only=numeric_only),
- )
-
-
@pytest.mark.parametrize("is_transposed", [False, True])
@pytest.mark.parametrize(
"skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
@@ -333,19 +334,17 @@ def test_sum(data, axis, skipna, is_transposed):
df_equals(modin_result, pandas_result)
+@pytest.mark.parametrize("fn", ["prod, sum"])
@pytest.mark.parametrize(
- "numeric_only",
- [
- pytest.param(None, marks=pytest.mark.xfail(reason="See #1976 for details")),
- False,
- True,
- ],
+ "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)
)
-@pytest.mark.parametrize("min_count", int_arg_values)
-def test_sum_specific(min_count, numeric_only):
+@pytest.mark.parametrize(
+ "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys)
+)
+def test_sum_prod_specific(fn, min_count, numeric_only):
eval_general(
*create_test_dfs(test_data_diff_dtype),
- lambda df: df.sum(min_count=min_count, numeric_only=numeric_only),
+ lambda df: getattr(df, fn)(min_count=min_count, numeric_only=numeric_only),
)
@@ -355,3 +354,17 @@ def test_sum_single_column(data):
pandas_df = pandas.DataFrame(data).iloc[:, [0]]
df_equals(modin_df.sum(), pandas_df.sum())
df_equals(modin_df.sum(axis=1), pandas_df.sum(axis=1))
+
+
+@pytest.mark.parametrize(
+ "fn", ["max", "min", "median", "mean", "skew", "kurt", "sem", "std", "var"]
+)
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize(
+ "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)
+)
+def test_reduction_specific(fn, numeric_only, axis):
+ eval_general(
+ *create_test_dfs(test_data_diff_dtype),
+ lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),
+ )
diff --git a/modin/pandas/test/dataframe/test_udf.py b/modin/pandas/test/dataframe/test_udf.py
index 4b39cf7cd22..1278c4f036c 100644
--- a/modin/pandas/test/dataframe/test_udf.py
+++ b/modin/pandas/test/dataframe/test_udf.py
@@ -34,13 +34,26 @@
udf_func_keys,
test_data,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
+def test_agg_dict():
+ md_df, pd_df = create_test_dfs(test_data_values[0])
+ agg_dict = {pd_df.columns[0]: "sum", pd_df.columns[-1]: ("sum", "count")}
+ eval_general(md_df, pd_df, lambda df: df.agg(agg_dict), raising_exceptions=True)
+
+ agg_dict = {
+ "new_col1": (pd_df.columns[0], "sum"),
+ "new_col2": (pd_df.columns[-1], "count"),
+ }
+ eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict), raising_exceptions=True)
+
+
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
"func",
@@ -49,16 +62,10 @@
)
@pytest.mark.parametrize("op", ["agg", "apply"])
def test_agg_apply(axis, func, op):
- # AssertionError may be arisen in case of
- # mismathing of index/columns in Modin and pandas.
- # See details in pandas issue 36189.
- try:
- eval_general(
- *create_test_dfs(test_data["float_nan_data"]),
- lambda df: getattr(df, op)(func, axis),
- )
- except AssertionError:
- pass
+ eval_general(
+ *create_test_dfs(test_data["float_nan_data"]),
+ lambda df: getattr(df, op)(func, axis),
+ )
@pytest.mark.parametrize("axis", ["rows", "columns"])
@@ -69,16 +76,10 @@ def test_agg_apply(axis, func, op):
)
@pytest.mark.parametrize("op", ["agg", "apply"])
def test_agg_apply_axis_names(axis, func, op):
- # AssertionError may be arisen in case of
- # mismathing of index/columns in Modin and pandas.
- # See details in pandas issue 36189.
- try:
- eval_general(
- *create_test_dfs(test_data["int_data"]),
- lambda df: getattr(df, op)(func, axis),
- )
- except AssertionError:
- pass
+ eval_general(
+ *create_test_dfs(test_data["int_data"]),
+ lambda df: getattr(df, op)(func, axis),
+ )
def test_aggregate_alias():
diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py
index ff8ce9c5117..e64202c18dc 100644
--- a/modin/pandas/test/dataframe/test_window.py
+++ b/modin/pandas/test/dataframe/test_window.py
@@ -40,8 +40,9 @@
create_test_dfs,
test_data_diff_dtype,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py
index 319ae2bf505..09a5dced43a 100644
--- a/modin/pandas/test/test_api.py
+++ b/modin/pandas/test/test_api.py
@@ -45,9 +45,9 @@ def test_top_level_api_equality():
]
ignore_modin = [
- "DEFAULT_NPARTITIONS",
"iterator",
"series",
+ "accessor",
"base",
"utils",
"dataframe",
diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py
index 9747c3d5a27..061629e8fef 100644
--- a/modin/pandas/test/test_concat.py
+++ b/modin/pandas/test/test_concat.py
@@ -17,9 +17,16 @@
import modin.pandas as pd
from modin.pandas.utils import from_pandas
-from .utils import df_equals, generate_dfs, generate_multiindex_dfs, generate_none_dfs
+from .utils import (
+ df_equals,
+ generate_dfs,
+ generate_multiindex_dfs,
+ generate_none_dfs,
+ create_test_dfs,
+)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
def test_df_concat():
@@ -173,6 +180,18 @@ def test_concat_with_empty_frame():
pandas.concat([pandas_empty_df, pandas_row]),
)
+ md_empty1, pd_empty1 = create_test_dfs(index=[1, 2, 3])
+ md_empty2, pd_empty2 = create_test_dfs(index=[2, 3, 4])
+
+ df_equals(
+ pd.concat([md_empty1, md_empty2], axis=0),
+ pandas.concat([pd_empty1, pd_empty2], axis=0),
+ )
+ df_equals(
+ pd.concat([md_empty1, md_empty2], axis=1),
+ pandas.concat([pd_empty1, pd_empty2], axis=1),
+ )
+
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("names", [False, True])
diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py
index daef0bbee75..79144b5b879 100644
--- a/modin/pandas/test/test_general.py
+++ b/modin/pandas/test/test_general.py
@@ -16,6 +16,7 @@
import modin.pandas as pd
import numpy as np
from numpy.testing import assert_array_equal
+from modin.utils import get_current_backend, to_pandas
from .utils import test_data_values, test_data_keys, df_equals
@@ -216,6 +217,264 @@ def test_merge_asof():
)
+def test_merge_asof_on_variations():
+ """on=,left_on=,right_on=,right_index=,left_index= options match Pandas."""
+ left = {"a": [1, 5, 10], "left_val": ["a", "b", "c"]}
+ left_index = [6, 8, 12]
+ right = {"a": [1, 2, 3, 6, 7], "right_val": ["d", "e", "f", "g", "h"]}
+ right_index = [6, 7, 8, 9, 15]
+ pandas_left, pandas_right = (
+ pandas.DataFrame(left, index=left_index),
+ pandas.DataFrame(right, index=right_index),
+ )
+ modin_left, modin_right = pd.DataFrame(left, index=left_index), pd.DataFrame(
+ right, index=right_index
+ )
+ for on_arguments in [
+ {"on": "a"},
+ {"left_on": "a", "right_on": "a"},
+ {"left_on": "a", "right_index": True},
+ {"left_index": True, "right_on": "a"},
+ {"left_index": True, "right_index": True},
+ ]:
+ pandas_merged = pandas.merge_asof(pandas_left, pandas_right, **on_arguments)
+ modin_merged = pd.merge_asof(modin_left, modin_right, **on_arguments)
+ df_equals(pandas_merged, modin_merged)
+
+
+def test_merge_asof_suffixes():
+ """Suffix variations are handled the same as Pandas."""
+ left = {"a": [1, 5, 10]}
+ right = {"a": [2, 3, 6]}
+ pandas_left, pandas_right = (pandas.DataFrame(left), pandas.DataFrame(right))
+ modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right)
+ for suffixes in [("a", "b"), (False, "c"), ("d", False)]:
+ pandas_merged = pandas.merge_asof(
+ pandas_left,
+ pandas_right,
+ left_index=True,
+ right_index=True,
+ suffixes=suffixes,
+ )
+ modin_merged = pd.merge_asof(
+ modin_left,
+ modin_right,
+ left_index=True,
+ right_index=True,
+ suffixes=suffixes,
+ )
+ df_equals(pandas_merged, modin_merged)
+
+ with pytest.raises(ValueError):
+ pandas.merge_asof(
+ pandas_left,
+ pandas_right,
+ left_index=True,
+ right_index=True,
+ suffixes=(False, False),
+ )
+ with pytest.raises(ValueError):
+ modin_merged = pd.merge_asof(
+ modin_left,
+ modin_right,
+ left_index=True,
+ right_index=True,
+ suffixes=(False, False),
+ )
+
+
+def test_merge_asof_bad_arguments():
+ left = {"a": [1, 5, 10], "b": [5, 7, 9]}
+ right = {"a": [2, 3, 6], "b": [6, 5, 20]}
+ pandas_left, pandas_right = (pandas.DataFrame(left), pandas.DataFrame(right))
+ modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right)
+
+ # Can't mix by with left_by/right_by
+ with pytest.raises(ValueError):
+ pandas.merge_asof(
+ pandas_left, pandas_right, on="a", by="b", left_by="can't do with by"
+ )
+ with pytest.raises(ValueError):
+ pd.merge_asof(
+ modin_left, modin_right, on="a", by="b", left_by="can't do with by"
+ )
+ with pytest.raises(ValueError):
+ pandas.merge_asof(
+ pandas_left, pandas_right, by="b", on="a", right_by="can't do with by"
+ )
+ with pytest.raises(ValueError):
+ pd.merge_asof(
+ modin_left, modin_right, by="b", on="a", right_by="can't do with by"
+ )
+
+ # Can't mix on with left_on/right_on
+ with pytest.raises(ValueError):
+ pandas.merge_asof(pandas_left, pandas_right, on="a", left_on="can't do with by")
+ with pytest.raises(ValueError):
+ pd.merge_asof(modin_left, modin_right, on="a", left_on="can't do with by")
+ with pytest.raises(ValueError):
+ pandas.merge_asof(
+ pandas_left, pandas_right, on="a", right_on="can't do with by"
+ )
+ with pytest.raises(ValueError):
+ pd.merge_asof(modin_left, modin_right, on="a", right_on="can't do with by")
+
+ # Can't mix left_index with left_on or on, similarly for right.
+ with pytest.raises(ValueError):
+ pd.merge_asof(modin_left, modin_right, on="a", right_index=True)
+ with pytest.raises(ValueError):
+ pd.merge_asof(
+ modin_left, modin_right, left_on="a", right_on="a", right_index=True
+ )
+ with pytest.raises(ValueError):
+ pd.merge_asof(modin_left, modin_right, on="a", left_index=True)
+ with pytest.raises(ValueError):
+ pd.merge_asof(
+ modin_left, modin_right, left_on="a", right_on="a", left_index=True
+ )
+
+ # Need both left and right
+    with pytest.raises(Exception):  # Pandas bug: inputs are not validated sufficiently
+ pandas.merge_asof(pandas_left, pandas_right, left_on="a")
+ with pytest.raises(ValueError):
+ pd.merge_asof(modin_left, modin_right, left_on="a")
+    with pytest.raises(Exception):  # Pandas bug: inputs are not validated sufficiently
+ pandas.merge_asof(pandas_left, pandas_right, right_on="a")
+ with pytest.raises(ValueError):
+ pd.merge_asof(modin_left, modin_right, right_on="a")
+ with pytest.raises(ValueError):
+ pandas.merge_asof(pandas_left, pandas_right)
+ with pytest.raises(ValueError):
+ pd.merge_asof(modin_left, modin_right)
+
+
+def test_merge_asof_merge_options():
+ modin_quotes = pd.DataFrame(
+ {
+ "time": [
+ pd.Timestamp("2016-05-25 13:30:00.023"),
+ pd.Timestamp("2016-05-25 13:30:00.023"),
+ pd.Timestamp("2016-05-25 13:30:00.030"),
+ pd.Timestamp("2016-05-25 13:30:00.041"),
+ pd.Timestamp("2016-05-25 13:30:00.048"),
+ pd.Timestamp("2016-05-25 13:30:00.049"),
+ pd.Timestamp("2016-05-25 13:30:00.072"),
+ pd.Timestamp("2016-05-25 13:30:00.075"),
+ ],
+ "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
+ "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
+ "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
+ }
+ )
+ modin_trades = pd.DataFrame(
+ {
+ "time": [
+ pd.Timestamp("2016-05-25 13:30:00.023"),
+ pd.Timestamp("2016-05-25 13:30:00.038"),
+ pd.Timestamp("2016-05-25 13:30:00.048"),
+ pd.Timestamp("2016-05-25 13:30:00.048"),
+ pd.Timestamp("2016-05-25 13:30:00.048"),
+ ],
+ "ticker2": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
+ "price": [51.95, 51.95, 720.77, 720.92, 98.0],
+ "quantity": [75, 155, 100, 100, 100],
+ }
+ )
+ pandas_quotes, pandas_trades = to_pandas(modin_quotes), to_pandas(modin_trades)
+
+ # left_by + right_by
+ df_equals(
+ pandas.merge_asof(
+ pandas_quotes,
+ pandas_trades,
+ on="time",
+ left_by="ticker",
+ right_by="ticker2",
+ ),
+ pd.merge_asof(
+ modin_quotes,
+ modin_trades,
+ on="time",
+ left_by="ticker",
+ right_by="ticker2",
+ ),
+ )
+
+ # Just by:
+ pandas_trades["ticker"] = pandas_trades["ticker2"]
+ modin_trades["ticker"] = modin_trades["ticker2"]
+ df_equals(
+ pandas.merge_asof(
+ pandas_quotes,
+ pandas_trades,
+ on="time",
+ by="ticker",
+ ),
+ pd.merge_asof(
+ modin_quotes,
+ modin_trades,
+ on="time",
+ by="ticker",
+ ),
+ )
+
+ # Tolerance
+ df_equals(
+ pandas.merge_asof(
+ pandas_quotes,
+ pandas_trades,
+ on="time",
+ by="ticker",
+ tolerance=pd.Timedelta("2ms"),
+ ),
+ pd.merge_asof(
+ modin_quotes,
+ modin_trades,
+ on="time",
+ by="ticker",
+ tolerance=pd.Timedelta("2ms"),
+ ),
+ )
+
+ # Direction
+ df_equals(
+ pandas.merge_asof(
+ pandas_quotes,
+ pandas_trades,
+ on="time",
+ by="ticker",
+ direction="forward",
+ ),
+ pd.merge_asof(
+ modin_quotes,
+ modin_trades,
+ on="time",
+ by="ticker",
+ direction="forward",
+ ),
+ )
+
+ # Allow exact matches
+ df_equals(
+ pandas.merge_asof(
+ pandas_quotes,
+ pandas_trades,
+ on="time",
+ by="ticker",
+ tolerance=pd.Timedelta("10ms"),
+ allow_exact_matches=False,
+ ),
+ pd.merge_asof(
+ modin_quotes,
+ modin_trades,
+ on="time",
+ by="ticker",
+ tolerance=pd.Timedelta("10ms"),
+ allow_exact_matches=False,
+ ),
+ )
+
+
def test_pivot():
test_df = pd.DataFrame(
{
@@ -269,10 +528,12 @@ def test_unique():
modin_result = pd.unique([2, 1, 3, 3])
pandas_result = pandas.unique([2, 1, 3, 3])
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.unique(pd.Series([2] + [1] * 5))
pandas_result = pandas.unique(pandas.Series([2] + [1] * 5))
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.unique(
pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])
@@ -281,6 +542,7 @@ def test_unique():
pandas.Series([pandas.Timestamp("20160101"), pandas.Timestamp("20160101")])
)
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.unique(
pd.Series(
@@ -299,6 +561,7 @@ def test_unique():
)
)
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.unique(
pd.Index(
@@ -317,10 +580,12 @@ def test_unique():
)
)
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.unique(pd.Series(pd.Categorical(list("baabc"))))
pandas_result = pandas.unique(pandas.Series(pandas.Categorical(list("baabc"))))
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
@pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)])
@@ -351,23 +616,29 @@ def sort_index_for_equal_values(result, ascending):
else:
new_index[j] = result.index[j]
i += 1
- return pandas.Series(result, index=new_index)
+ return type(result)(result, index=new_index)
- # We sort indices for pandas result because of issue #1650
+    # We sort indices for both the Modin and pandas results because of issue #1650
values = np.array([3, 1, 2, 3, 4, np.nan])
- modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
+ modin_result = sort_index_for_equal_values(
+ pd.value_counts(values, normalize=normalize, ascending=False), False
+ )
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, normalize=normalize, ascending=False), False
)
df_equals(modin_result, pandas_result)
- modin_result = pd.value_counts(values, bins=bins, ascending=False)
+ modin_result = sort_index_for_equal_values(
+ pd.value_counts(values, bins=bins, ascending=False), False
+ )
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, bins=bins, ascending=False), False
)
df_equals(modin_result, pandas_result)
- modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
+ modin_result = sort_index_for_equal_values(
+ pd.value_counts(values, dropna=dropna, ascending=True), True
+ )
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, dropna=dropna, ascending=True), True
)
@@ -437,3 +708,29 @@ def test_to_pandas_indices():
assert md_df.axes[axis].equal_levels(
pd_df.axes[axis]
), f"Levels of indices at axis {axis} are different!"
+
+
+@pytest.mark.skipif(
+ get_current_backend() != "BaseOnPython",
+ reason="This test make sense only on BaseOnPython backend.",
+)
+@pytest.mark.parametrize(
+ "func, regex",
+ [
+ (lambda df: df.mean(level=0), r"DataFrame\.mean"),
+ (lambda df: df + df, r"DataFrame\.add"),
+ (lambda df: df.index, r"DataFrame\.get_axis\(0\)"),
+ (
+ lambda df: df.drop(columns="col1").squeeze().repeat(2),
+ r"Series\.repeat",
+ ),
+ (lambda df: df.groupby("col1").prod(), r"GroupBy\.prod"),
+ (lambda df: df.rolling(1).count(), r"Rolling\.count"),
+ ],
+)
+def test_default_to_pandas_warning_message(func, regex):
+ data = {"col1": [1, 2, 3], "col2": [4, 5, 6]}
+ df = pd.DataFrame(data)
+
+ with pytest.warns(UserWarning, match=regex):
+ func(df)
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
index bbd5efb9f24..13cfdcd60b2 100644
--- a/modin/pandas/test/test_groupby.py
+++ b/modin/pandas/test/test_groupby.py
@@ -16,17 +16,20 @@
import numpy as np
import modin.pandas as pd
from modin.utils import try_cast_to_pandas, get_current_backend
-from modin.pandas.utils import from_pandas
+from modin.pandas.utils import from_pandas, is_scalar
from .utils import (
df_equals,
check_df_columns_have_nans,
create_test_dfs,
eval_general,
+ test_data,
test_data_values,
modin_df_almost_equals_pandas,
+ generate_multiindex,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
def modin_groupby_equals_pandas(modin_groupby, pandas_groupby):
@@ -49,6 +52,17 @@ def eval_aggregation(md_df, pd_df, operation=None, by=None, *args, **kwargs):
)
+def build_types_asserter(comparator):
+ def wrapper(obj1, obj2, *args, **kwargs):
+ error_str = f"obj1 and obj2 has incorrect types: {type(obj1)} and {type(obj2)}"
+ assert not (is_scalar(obj1) ^ is_scalar(obj2)), error_str
+ assert obj1.__module__.split(".")[0] == "modin", error_str
+ assert obj2.__module__.split(".")[0] == "pandas", error_str
+ comparator(obj1, obj2, *args, **kwargs)
+
+ return wrapper
+
+
@pytest.mark.parametrize("as_index", [True, False])
def test_mixed_dtypes_groupby(as_index):
frame_data = np.random.randint(97, 198, size=(2 ** 6, 2 ** 4))
@@ -93,6 +107,7 @@ def test_mixed_dtypes_groupby(as_index):
modin_df_almost_equals_pandas,
is_default=True,
)
+ eval_shift(modin_groupby, pandas_groupby)
eval_mean(modin_groupby, pandas_groupby)
eval_any(modin_groupby, pandas_groupby)
eval_min(modin_groupby, pandas_groupby)
@@ -137,7 +152,18 @@ def test_mixed_dtypes_groupby(as_index):
eval_var(modin_groupby, pandas_groupby)
eval_skew(modin_groupby, pandas_groupby)
- agg_functions = ["min", "max"]
+ agg_functions = [
+ lambda df: df.sum(),
+ "min",
+ min,
+ "max",
+ max,
+ sum,
+ {"col2": "sum"},
+ {"col2": sum},
+ {"col2": "max", "col4": "sum", "col5": "min"},
+ {"col2": max, "col4": sum, "col5": "min"},
+ ]
for func in agg_functions:
eval_agg(modin_groupby, pandas_groupby, func)
eval_aggregate(modin_groupby, pandas_groupby, func)
@@ -216,7 +242,7 @@ def __call__(self, df):
["col1"],
# col2 contains NaN, is it necessary to test functions like size()
"col2",
- ["col2"],
+ ["col2"], # 5
pytest.param(
["col1", "col2"],
marks=pytest.mark.xfail(reason="Excluded because of bug #1554"),
@@ -234,12 +260,12 @@ def __call__(self, df):
marks=pytest.mark.xfail(reason="Excluded because of bug #1554"),
),
# but cum* functions produce undefined results with NaNs so we need to test the same combinations without NaN too
- ["col5"],
+ ["col5"], # 10
["col1", "col5"],
["col5", "col4"],
["col4", "col5"],
["col5", "col4", "col1"],
- ["col1", pd.Series([1, 5, 7, 8])],
+ ["col1", pd.Series([1, 5, 7, 8])], # 15
[pd.Series([1, 5, 7, 8])],
[
pd.Series([1, 5, 7, 8]),
@@ -250,7 +276,7 @@ def __call__(self, df):
],
["col1", GetColumn("col5")],
[GetColumn("col1"), GetColumn("col5")],
- [GetColumn("col1")],
+ [GetColumn("col1")], # 20
],
)
@pytest.mark.parametrize("as_index", [True, False])
@@ -287,6 +313,7 @@ def maybe_get_columns(df, by):
modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
eval_ngroups(modin_groupby, pandas_groupby)
+ eval_shift(modin_groupby, pandas_groupby)
eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
eval_general(
modin_groupby,
@@ -317,7 +344,11 @@ def maybe_get_columns(df, by):
# Workaround for Pandas bug #34656. Recreate groupby object for Pandas
pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)
- apply_functions = [lambda df: df.sum(), min]
+ apply_functions = [
+ lambda df: df.sum(),
+ lambda df: pandas.Series([1, 2, 3, 4], name="result"),
+ min,
+ ]
for func in apply_functions:
eval_apply(modin_groupby, pandas_groupby, func)
@@ -334,7 +365,7 @@ def maybe_get_columns(df, by):
eval_var(modin_groupby, pandas_groupby)
eval_skew(modin_groupby, pandas_groupby)
- agg_functions = ["min", "max"]
+ agg_functions = [lambda df: df.sum(), "min", "max", min, sum]
for func in agg_functions:
eval_agg(modin_groupby, pandas_groupby, func)
eval_aggregate(modin_groupby, pandas_groupby, func)
@@ -352,7 +383,15 @@ def maybe_get_columns(df, by):
eval_len(modin_groupby, pandas_groupby)
eval_sum(modin_groupby, pandas_groupby)
eval_ngroup(modin_groupby, pandas_groupby)
- eval_general(modin_groupby, pandas_groupby, lambda df: df.nunique())
+    # Pandas raises an exception when 'by' contains a categorical key and `as_index=False`
+    # because of a bug: https://github.com/pandas-dev/pandas/issues/36698
+    # Modin processes the result correctly, which is why `check_exception_type=None` in some cases
+ eval_general(
+ modin_groupby,
+ pandas_groupby,
+ lambda df: df.nunique(),
+ check_exception_type=None if (col1_category and not as_index) else True,
+ )
eval_median(modin_groupby, pandas_groupby)
eval_general(modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True)
eval_general(
@@ -426,6 +465,7 @@ def test_single_group_row_groupby():
modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
eval_ngroups(modin_groupby, pandas_groupby)
+ eval_shift(modin_groupby, pandas_groupby)
eval_skew(modin_groupby, pandas_groupby)
eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
eval_general(
@@ -465,7 +505,15 @@ def test_single_group_row_groupby():
eval_prod(modin_groupby, pandas_groupby)
eval_std(modin_groupby, pandas_groupby)
- agg_functions = ["min", "max"]
+ agg_functions = [
+ lambda df: df.sum(),
+ "min",
+ "max",
+ max,
+ sum,
+ {"col2": "sum"},
+ {"col2": "max", "col4": "sum", "col5": "min"},
+ ]
for func in agg_functions:
eval_agg(modin_groupby, pandas_groupby, func)
eval_aggregate(modin_groupby, pandas_groupby, func)
@@ -541,6 +589,7 @@ def test_large_row_groupby(is_by_category):
modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
eval_ngroups(modin_groupby, pandas_groupby)
+ eval_shift(modin_groupby, pandas_groupby)
eval_skew(modin_groupby, pandas_groupby)
eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
eval_general(
@@ -580,7 +629,16 @@ def test_large_row_groupby(is_by_category):
# eval_prod(modin_groupby, pandas_groupby) causes overflows
eval_std(modin_groupby, pandas_groupby)
- agg_functions = ["min", "max"]
+ agg_functions = [
+ lambda df: df.sum(),
+ "min",
+ "max",
+ min,
+ sum,
+ {"A": "sum"},
+ {"A": lambda df: df.sum()},
+ {"A": "max", "B": "sum", "C": "min"},
+ ]
for func in agg_functions:
eval_agg(modin_groupby, pandas_groupby, func)
eval_aggregate(modin_groupby, pandas_groupby, func)
@@ -655,6 +713,7 @@ def test_simple_col_groupby():
modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
eval_ngroups(modin_groupby, pandas_groupby)
+ eval_shift(modin_groupby, pandas_groupby)
eval_skew(modin_groupby, pandas_groupby)
eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
eval_general(
@@ -785,6 +844,7 @@ def test_series_groupby(by, as_index_series_or_dataframe):
modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
eval_ngroups(modin_groupby, pandas_groupby)
+ eval_shift(modin_groupby, pandas_groupby)
eval_general(
modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True
)
@@ -835,7 +895,7 @@ def test_series_groupby(by, as_index_series_or_dataframe):
eval_var(modin_groupby, pandas_groupby)
eval_skew(modin_groupby, pandas_groupby)
- agg_functions = ["min", "max"]
+ agg_functions = [lambda df: df.sum(), "min", "max", max, sum]
for func in agg_functions:
eval_agg(modin_groupby, pandas_groupby, func)
eval_aggregate(modin_groupby, pandas_groupby, func)
@@ -1038,16 +1098,18 @@ def eval_quantile(modin_groupby, pandas_groupby):
def eval___getattr__(modin_groupby, pandas_groupby, item):
- try:
- pandas_groupby = pandas_groupby[item]
- pandas_result = pandas_groupby.count()
- except Exception as e:
- with pytest.raises(type(e)):
- modin_groupby[item].count()
- else:
- modin_groupby = modin_groupby[item]
- modin_result = modin_groupby.count()
- df_equals(modin_result, pandas_result)
+ eval_general(
+ modin_groupby,
+ pandas_groupby,
+ lambda grp: grp[item].count(),
+ comparator=build_types_asserter(df_equals),
+ )
+ eval_general(
+ modin_groupby,
+ pandas_groupby,
+ lambda grp: getattr(grp, item).count(),
+ comparator=build_types_asserter(df_equals),
+ )
def eval_groups(modin_groupby, pandas_groupby):
@@ -1056,7 +1118,26 @@ def eval_groups(modin_groupby, pandas_groupby):
def eval_shift(modin_groupby, pandas_groupby):
- assert modin_groupby.groups == pandas_groupby.groups
+ eval_general(
+ modin_groupby,
+ pandas_groupby,
+ lambda groupby: groupby.shift(),
+ )
+ eval_general(
+ modin_groupby,
+ pandas_groupby,
+ lambda groupby: groupby.shift(periods=0),
+ )
+ eval_general(
+ modin_groupby,
+ pandas_groupby,
+ lambda groupby: groupby.shift(periods=-3),
+ )
+ eval_general(
+ modin_groupby,
+ pandas_groupby,
+ lambda groupby: groupby.shift(axis=1, fill_value=777),
+ )
def test_groupby_on_index_values_with_loop():
@@ -1109,28 +1190,134 @@ def test_groupby_multiindex():
df_equals(modin_df.groupby(by=by).count(), pandas_df.groupby(by=by).count())
-@pytest.mark.skip("See Modin issue #2254 for details")
-def test_agg_func_None_rename():
+@pytest.mark.parametrize("groupby_axis", [0, 1])
+@pytest.mark.parametrize("shift_axis", [0, 1])
+def test_shift_freq(groupby_axis, shift_axis):
pandas_df = pandas.DataFrame(
{
- "col1": np.random.randint(0, 100, size=1000),
- "col2": np.random.randint(0, 100, size=1000),
- "col3": np.random.randint(0, 100, size=1000),
- "col4": np.random.randint(0, 100, size=1000),
- },
- index=["row{}".format(i) for i in range(1000)],
+ "col1": [1, 0, 2, 3],
+ "col2": [4, 5, np.NaN, 7],
+ "col3": [np.NaN, np.NaN, 12, 10],
+ "col4": [17, 13, 16, 15],
+ }
)
modin_df = from_pandas(pandas_df)
- modin_result = modin_df.groupby(["col1", "col2"]).agg(
- max=("col3", np.max), min=("col3", np.min)
+ new_index = pandas.date_range("1/12/2020", periods=4, freq="S")
+ if groupby_axis == 0 and shift_axis == 0:
+ pandas_df.index = modin_df.index = new_index
+ by = [["col2", "col3"], ["col2"], ["col4"], [0, 1, 0, 2]]
+ else:
+ pandas_df.index = modin_df.index = new_index
+ pandas_df.columns = modin_df.columns = new_index
+ by = [[0, 1, 0, 2]]
+
+ for _by in by:
+ pandas_groupby = pandas_df.groupby(by=_by, axis=groupby_axis)
+ modin_groupby = modin_df.groupby(by=_by, axis=groupby_axis)
+ eval_general(
+ modin_groupby,
+ pandas_groupby,
+ lambda groupby: groupby.shift(axis=shift_axis, freq="S"),
+ )
+
+
+@pytest.mark.parametrize(
+ "by_and_agg_dict",
+ [
+ {
+ "by": [
+ list(test_data["int_data"].keys())[0],
+ list(test_data["int_data"].keys())[1],
+ ],
+ "agg_dict": {
+ "max": (list(test_data["int_data"].keys())[2], np.max),
+ "min": (list(test_data["int_data"].keys())[2], np.min),
+ },
+ },
+ {
+ "by": ["col1"],
+ "agg_dict": {
+ "max": (list(test_data["int_data"].keys())[0], np.max),
+ "min": (list(test_data["int_data"].keys())[-1], np.min),
+ },
+ },
+ {
+ "by": [
+ list(test_data["int_data"].keys())[0],
+ list(test_data["int_data"].keys())[-1],
+ ],
+ "agg_dict": {
+ "max": (list(test_data["int_data"].keys())[1], max),
+ "min": (list(test_data["int_data"].keys())[-2], min),
+ },
+ },
+ pytest.param(
+ {
+ "by": [
+ list(test_data["int_data"].keys())[0],
+ list(test_data["int_data"].keys())[-1],
+ ],
+ "agg_dict": {
+ "max": (list(test_data["int_data"].keys())[1], max),
+ "min": (list(test_data["int_data"].keys())[-1], min),
+ },
+ },
+ marks=pytest.mark.skip("See Modin issue #2542"),
+ ),
+ ],
+)
+@pytest.mark.parametrize("as_index", [True, False])
+def test_agg_func_None_rename(by_and_agg_dict, as_index):
+ modin_df, pandas_df = create_test_dfs(test_data["int_data"])
+
+ modin_result = modin_df.groupby(by_and_agg_dict["by"], as_index=as_index).agg(
+ **by_and_agg_dict["agg_dict"]
)
- pandas_result = pandas_df.groupby(["col1", "col2"]).agg(
- max=("col3", np.max), min=("col3", np.min)
+ pandas_result = pandas_df.groupby(by_and_agg_dict["by"], as_index=as_index).agg(
+ **by_and_agg_dict["agg_dict"]
)
df_equals(modin_result, pandas_result)
+@pytest.mark.parametrize(
+ "as_index",
+ [
+ True,
+ pytest.param(
+ False,
+ marks=pytest.mark.xfail_backends(
+ ["BaseOnPython"], reason="See Pandas issue #39103"
+ ),
+ ),
+ ],
+)
+@pytest.mark.parametrize("by_length", [1, 3])
+@pytest.mark.parametrize(
+ "agg_fns",
+ [["sum", "min", "max"], ["mean", "quantile"]],
+ ids=["reduction", "aggregation"],
+)
+def test_dict_agg_rename_mi_columns(as_index, by_length, agg_fns):
+ md_df, pd_df = create_test_dfs(test_data["int_data"])
+ mi_columns = generate_multiindex(len(md_df.columns), nlevels=4)
+
+ md_df.columns, pd_df.columns = mi_columns, mi_columns
+
+ by = list(md_df.columns[:by_length])
+ agg_cols = list(md_df.columns[by_length : by_length + 3])
+
+ agg_dict = {
+ f"custom-{i}" + str(agg_fns[i % len(agg_fns)]): (col, agg_fns[i % len(agg_fns)])
+ for i, col in enumerate(agg_cols)
+ }
+
+ md_res = md_df.groupby(by, as_index=as_index).agg(**agg_dict)
+ pd_res = pd_df.groupby(by, as_index=as_index).agg(**agg_dict)
+
+ df_equals(md_res, pd_res)
+
+
@pytest.mark.parametrize(
"operation",
[
@@ -1176,6 +1363,12 @@ def test_agg_exceptions(operation):
eval_aggregation(*create_test_dfs(data), operation=operation)
+@pytest.mark.skip(
+ "Pandas raises a ValueError on empty dictionary aggregation since 1.2.0"
+ "It's unclear is that was made on purpose or it is a bug. That question"
+ "was asked in https://github.com/pandas-dev/pandas/issues/39609."
+ "So until the answer this test is disabled."
+)
@pytest.mark.parametrize(
"kwargs",
[
@@ -1238,9 +1431,11 @@ def get_columns(df):
[(True, "a"), (True, "b")],
[(False, "a"), (False, "b"), (True, "c")],
[(False, "a"), (True, "c")],
+ [(False, "a"), (True, "c"), (False, [1, 1, 2])],
],
)
-def test_mixed_columns_not_from_df(columns):
+@pytest.mark.parametrize("as_index", [True, False])
+def test_mixed_columns_not_from_df(columns, as_index):
"""
Unlike the previous test, in this case the Series is not just a column from
the original DataFrame, so you can't use a fasttrack.
@@ -1250,15 +1445,18 @@ def get_columns(df):
return [(df[name] + 1) if lookup else name for (lookup, name) in columns]
data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]}
+ groupby_kw = {"as_index": as_index}
- df1 = pandas.DataFrame(data)
- df1 = pandas.concat([df1])
- ref = df1.groupby(get_columns(df1)).size()
+ md_df, pd_df = create_test_dfs(data)
+ by_md, by_pd = map(get_columns, [md_df, pd_df])
- df2 = pd.DataFrame(data)
- df2 = pd.concat([df2])
- exp = df2.groupby(get_columns(df2)).size()
- df_equals(ref, exp)
+ pd_grp = pd_df.groupby(by_pd, **groupby_kw)
+ md_grp = md_df.groupby(by_md, **groupby_kw)
+
+ modin_groupby_equals_pandas(md_grp, pd_grp)
+ eval_general(md_grp, pd_grp, lambda grp: grp.size())
+ eval_general(md_grp, pd_grp, lambda grp: grp.apply(lambda df: df.sum()))
+ eval_general(md_grp, pd_grp, lambda grp: grp.first())
@pytest.mark.parametrize(
@@ -1293,18 +1491,90 @@ def get_columns(df):
"func_to_apply",
[
lambda df: df.sum(),
- lambda df: df.count(),
lambda df: df.size(),
- lambda df: df.mean(),
lambda df: df.quantile(),
+ lambda df: df.dtypes,
+ lambda df: df.apply(lambda df: df.sum()),
+ pytest.param(
+ lambda df: df.apply(lambda df: pandas.Series([1, 2, 3, 4])),
+ marks=pytest.mark.skip("See modin issue #2511"),
+ ),
+ lambda grp: grp.agg(
+ {
+ list(test_data_values[0].keys())[1]: (max, min, sum),
+ list(test_data_values[0].keys())[-2]: (sum, min, max),
+ }
+ ),
+ lambda grp: grp.agg(
+ {
+ list(test_data_values[0].keys())[1]: [
+ ("new_sum", "sum"),
+ ("new_min", "min"),
+ ],
+ list(test_data_values[0].keys())[-2]: np.sum,
+ }
+ ),
+ pytest.param(
+ lambda grp: grp.agg(
+ {
+ list(test_data_values[0].keys())[1]: (max, min, sum),
+ list(test_data_values[0].keys())[-1]: (sum, min, max),
+ }
+ ),
+ marks=pytest.mark.skip("See modin issue #2542"),
+ ),
],
)
-def test_multi_column_groupby_different_partitions(func_to_apply):
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize("by_length", [1, 2])
+@pytest.mark.parametrize(
+ "categorical_by",
+ [pytest.param(True, marks=pytest.mark.skip("See modin issue #2513")), False],
+)
+def test_multi_column_groupby_different_partitions(
+ func_to_apply, as_index, by_length, categorical_by
+):
data = test_data_values[0]
md_df, pd_df = create_test_dfs(data)
- # columns that will be located in a different partitions
- by = [pd_df.columns[0], pd_df.columns[-1]]
+ by = [pd_df.columns[-i if i % 2 else i] for i in range(by_length)]
- md_grp, pd_grp = md_df.groupby(by), pd_df.groupby(by)
+ if categorical_by:
+ md_df = md_df.astype({by[0]: "category"})
+ pd_df = pd_df.astype({by[0]: "category"})
+
+ md_grp, pd_grp = md_df.groupby(by, as_index=as_index), pd_df.groupby(
+ by, as_index=as_index
+ )
eval_general(md_grp, pd_grp, func_to_apply)
+
+
+@pytest.mark.parametrize(
+ "by",
+ [
+ 0,
+ 1.5,
+ "str",
+ pandas.Timestamp("2020-02-02"),
+ [None],
+ [0, "str"],
+ [None, 0],
+ [pandas.Timestamp("2020-02-02"), 1.5],
+ ],
+)
+@pytest.mark.parametrize("as_index", [True, False])
+def test_not_str_by(by, as_index):
+ data = {f"col{i}": np.arange(5) for i in range(5)}
+ columns = pandas.Index([0, 1.5, "str", pandas.Timestamp("2020-02-02"), None])
+
+ md_df, pd_df = create_test_dfs(data, columns=columns)
+ md_grp, pd_grp = md_df.groupby(by, as_index=as_index), pd_df.groupby(
+ by, as_index=as_index
+ )
+
+ modin_groupby_equals_pandas(md_grp, pd_grp)
+ df_equals(md_grp.sum(), pd_grp.sum())
+ df_equals(md_grp.size(), pd_grp.size())
+ df_equals(md_grp.agg(lambda df: df.mean()), pd_grp.agg(lambda df: df.mean()))
+ df_equals(md_grp.dtypes, pd_grp.dtypes)
+ df_equals(md_grp.first(), pd_grp.first())
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index 197a03fb6dd..b0f09de57d2 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -16,145 +16,59 @@
import pandas
from pandas.errors import ParserWarning
from collections import OrderedDict
+from modin.config import TestDatasetSize
from modin.utils import to_pandas
from modin.pandas.utils import from_arrow
-from pathlib import Path
import pyarrow as pa
-import pyarrow.parquet as pq
import os
import shutil
import sqlalchemy as sa
import csv
+import tempfile
from .utils import (
+ check_file_leaks,
df_equals,
json_short_string,
json_short_bytes,
json_long_string,
json_long_bytes,
- eval_general,
+ eval_io,
+ get_unique_filename,
+ io_ops_bad_exc,
+ eval_io_from_str,
+ dummy_decorator,
+ create_test_dfs,
+ COMP_TO_EXT,
+ teardown_test_files,
)
-from modin.config import Engine, Backend
+from modin.config import Engine, Backend, IsExperimental
if Backend.get() == "Pandas":
import modin.pandas as pd
else:
import modin.experimental.pandas as pd
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
-
-TEST_PARQUET_FILENAME = "test.parquet"
-TEST_CSV_FILENAME = "test.csv"
-TEST_JSON_FILENAME = "test.json"
-TEST_HTML_FILENAME = "test.html"
-TEST_EXCEL_FILENAME = "test.xlsx"
-TEST_FEATHER_FILENAME = "test.feather"
-TEST_READ_HDF_FILENAME = "test.hdf"
-TEST_WRITE_HDF_FILENAME_MODIN = "test_write_modin.hdf"
-TEST_WRITE_HDF_FILENAME_PANDAS = "test_write_pandas.hdf"
-TEST_STATA_FILENAME = "test.dta"
-TEST_PICKLE_FILENAME = "test.pkl"
-TEST_SAS_FILENAME = os.getcwd() + "/data/test1.sas7bdat"
-TEST_FWF_FILENAME = "test_fwf.txt"
-TEST_GBQ_FILENAME = "test_gbq."
-SMALL_ROW_SIZE = 2000
-
-
-def eval_io(path, fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs):
- def applyier(module, *args, **kwargs):
- result = getattr(module, fn_name)(*args, **kwargs)
- # There could be some missmatches in dtypes, so we're
- # casting the whole frame to `str` before comparison.
- # See issue #1931 for details.
- if cast_to_str:
- result = result.astype(str)
- return result
-
- eval_general(
- pd,
- pandas,
- applyier,
- path=path,
- *args,
- **kwargs,
- )
-
-
-@pytest.fixture
-def make_parquet_file():
- """Pytest fixture factory that makes a parquet file/dir for testing.
-
- Yields:
- Function that generates a parquet file/dir
- """
-
- def _make_parquet_file(
- row_size=SMALL_ROW_SIZE, force=False, directory=False, partitioned_columns=[]
- ):
- """Helper function to generate parquet files/directories.
-
- Args:
- row_size: Number of rows for the dataframe.
- force: Create a new file/directory even if one already exists.
- directory: Create a partitioned directory using pyarrow.
- partitioned_columns: Create a partitioned directory using pandas.
- Will be ignored if directory=True.
- """
- df = pandas.DataFrame(
- {"col1": np.arange(row_size), "col2": np.arange(row_size)}
- )
- if os.path.exists(TEST_PARQUET_FILENAME) and not force:
- pass
- elif directory:
- if os.path.exists(TEST_PARQUET_FILENAME):
- shutil.rmtree(TEST_PARQUET_FILENAME)
- else:
- os.mkdir(TEST_PARQUET_FILENAME)
- table = pa.Table.from_pandas(df)
- pq.write_to_dataset(table, root_path=TEST_PARQUET_FILENAME)
- elif len(partitioned_columns) > 0:
- df.to_parquet(TEST_PARQUET_FILENAME, partition_cols=partitioned_columns)
- else:
- df.to_parquet(TEST_PARQUET_FILENAME)
-
- # Return function that generates csv files
- yield _make_parquet_file
-
- # Delete parquet file that was created
- if os.path.exists(TEST_PARQUET_FILENAME):
- if os.path.isdir(TEST_PARQUET_FILENAME):
- shutil.rmtree(TEST_PARQUET_FILENAME)
- else:
- os.remove(TEST_PARQUET_FILENAME)
-
-
-def create_test_modin_dataframe():
- df = pd.DataFrame(
- {
- "col1": [0, 1, 2, 3],
- "col2": [4, 5, 6, 7],
- "col3": [8, 9, 10, 11],
- "col4": [12, 13, 14, 15],
- "col5": [0, 0, 0, 0],
- }
- )
+NPartitions.put(4)
- return df
+DATASET_SIZE_DICT = {
+ "Small": 64,
+ "Normal": 2000,
+ "Big": 20000,
+}
+# Number of rows in the test file
+NROWS = DATASET_SIZE_DICT.get(TestDatasetSize.get(), DATASET_SIZE_DICT["Small"])
-def create_test_pandas_dataframe():
- df = pandas.DataFrame(
- {
- "col1": [0, 1, 2, 3],
- "col2": [4, 5, 6, 7],
- "col3": [8, 9, 10, 11],
- "col4": [12, 13, 14, 15],
- "col5": [0, 0, 0, 0],
- }
- )
-
- return df
+TEST_DATA = {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [0, 0, 0, 0],
+}
def assert_files_eq(path1, path2):
@@ -168,225 +82,93 @@ def assert_files_eq(path1, path2):
return False
-def teardown_test_file(test_path):
- if os.path.exists(test_path):
- os.remove(test_path)
-
-
-@pytest.fixture
-def make_csv_file(delimiter=",", compression="infer"):
- """Pytest fixture factory that makes temp csv files for testing.
-
- Yields:
- Function that generates csv files
- """
- filenames = []
-
- def _make_csv_file(
- filename=TEST_CSV_FILENAME,
- row_size=SMALL_ROW_SIZE,
- force=True,
- delimiter=delimiter,
- encoding=None,
- compression=compression,
- ):
- if os.path.exists(filename) and not force:
- pass
- else:
- dates = pandas.date_range("2000", freq="h", periods=row_size)
- df = pandas.DataFrame(
- {
- "col1": np.arange(row_size),
- "col2": [str(x.date()) for x in dates],
- "col3": np.arange(row_size),
- "col4": [str(x.time()) for x in dates],
- }
- )
- if compression == "gzip":
- filename = "{}.gz".format(filename)
- elif compression == "zip" or compression == "xz" or compression == "bz2":
- filename = "{fname}.{comp}".format(fname=filename, comp=compression)
-
- df.to_csv(
- filename, sep=delimiter, encoding=encoding, compression=compression
- )
- filenames.append(filename)
- return df
-
- # Return function that generates csv files
- yield _make_csv_file
-
- # Delete csv files that were created
- for filename in filenames:
- if os.path.exists(filename):
- try:
- os.remove(filename)
- except PermissionError:
- pass
-
-
-def setup_json_file(row_size, force=False):
- if os.path.exists(TEST_JSON_FILENAME) and not force:
+def setup_json_file(filename, row_size=NROWS, force=True):
+ if os.path.exists(filename) and not force:
pass
else:
df = pandas.DataFrame(
{"col1": np.arange(row_size), "col2": np.arange(row_size)}
)
- df.to_json(TEST_JSON_FILENAME)
+ df.to_json(filename)
-def setup_json_lines_file(row_size, force=False):
- if os.path.exists(TEST_JSON_FILENAME) and not force:
+def setup_json_lines_file(filename, row_size=NROWS, force=True):
+ if os.path.exists(filename) and not force:
pass
else:
df = pandas.DataFrame(
{"col1": np.arange(row_size), "col2": np.arange(row_size)}
)
- df.to_json(TEST_JSON_FILENAME, lines=True, orient="records")
-
-
-def teardown_json_file():
- if os.path.exists(TEST_JSON_FILENAME):
- os.remove(TEST_JSON_FILENAME)
+ df.to_json(filename, lines=True, orient="records")
-def setup_html_file(row_size, force=False):
- if os.path.exists(TEST_HTML_FILENAME) and not force:
+def setup_html_file(filename, row_size=NROWS, force=True):
+ if os.path.exists(filename) and not force:
pass
else:
df = pandas.DataFrame(
{"col1": np.arange(row_size), "col2": np.arange(row_size)}
)
- df.to_html(TEST_HTML_FILENAME)
+ df.to_html(filename)
-def teardown_html_file():
- if os.path.exists(TEST_HTML_FILENAME):
- os.remove(TEST_HTML_FILENAME)
-
-
-def setup_clipboard(row_size, force=False):
+def setup_clipboard(row_size=NROWS):
df = pandas.DataFrame({"col1": np.arange(row_size), "col2": np.arange(row_size)})
df.to_clipboard()
-def setup_excel_file(row_size, force=False):
- if os.path.exists(TEST_EXCEL_FILENAME) and not force:
+def setup_excel_file(filename, row_size=NROWS, force=True):
+ if os.path.exists(filename) and not force:
pass
else:
df = pandas.DataFrame(
{"col1": np.arange(row_size), "col2": np.arange(row_size)}
)
- df.to_excel(TEST_EXCEL_FILENAME)
-
-
-def teardown_excel_file():
- if os.path.exists(TEST_EXCEL_FILENAME):
- try:
- os.remove(TEST_EXCEL_FILENAME)
- except PermissionError:
- pass
+ df.to_excel(filename)
-def setup_feather_file(row_size, force=False):
- if os.path.exists(TEST_FEATHER_FILENAME) and not force:
+def setup_feather_file(filename, row_size=NROWS, force=True):
+ if os.path.exists(filename) and not force:
pass
else:
df = pandas.DataFrame(
{"col1": np.arange(row_size), "col2": np.arange(row_size)}
)
- df.to_feather(TEST_FEATHER_FILENAME)
-
+ df.to_feather(filename)
-def teardown_feather_file():
- if os.path.exists(TEST_FEATHER_FILENAME):
- os.remove(TEST_FEATHER_FILENAME)
-
-def setup_hdf_file(row_size, force=False, format=None):
- if os.path.exists(TEST_READ_HDF_FILENAME) and not force:
+def setup_hdf_file(filename, row_size=NROWS, force=True, format=None):
+ if os.path.exists(filename) and not force:
pass
else:
df = pandas.DataFrame(
{"col1": np.arange(row_size), "col2": np.arange(row_size)}
)
- df.to_hdf(TEST_READ_HDF_FILENAME, key="df", format=format)
-
+ df.to_hdf(filename, key="df", format=format)
-def teardown_hdf_file():
- if os.path.exists(TEST_READ_HDF_FILENAME):
- os.remove(TEST_READ_HDF_FILENAME)
-
-def setup_stata_file(row_size, force=False):
- if os.path.exists(TEST_STATA_FILENAME) and not force:
+def setup_stata_file(filename, row_size=NROWS, force=True):
+ if os.path.exists(filename) and not force:
pass
else:
df = pandas.DataFrame(
{"col1": np.arange(row_size), "col2": np.arange(row_size)}
)
- df.to_stata(TEST_STATA_FILENAME)
-
+ df.to_stata(filename)
-def teardown_stata_file():
- if os.path.exists(TEST_STATA_FILENAME):
- os.remove(TEST_STATA_FILENAME)
-
-def setup_pickle_file(row_size, force=False):
- if os.path.exists(TEST_PICKLE_FILENAME) and not force:
+def setup_pickle_file(filename, row_size=NROWS, force=True):
+ if os.path.exists(filename) and not force:
pass
else:
df = pandas.DataFrame(
{"col1": np.arange(row_size), "col2": np.arange(row_size)}
)
- df.to_pickle(TEST_PICKLE_FILENAME)
-
+ df.to_pickle(filename)
-def teardown_pickle_file():
- if os.path.exists(TEST_PICKLE_FILENAME):
- os.remove(TEST_PICKLE_FILENAME)
-
-@pytest.fixture
-def make_sql_connection():
- """Sets up sql connections and takes them down after the caller is done.
-
- Yields:
- Factory that generates sql connection objects
- """
- filenames = []
-
- def _sql_connection(filename, table=""):
- # Remove file if exists
- if os.path.exists(filename):
- os.remove(filename)
- filenames.append(filename)
- # Create connection and, if needed, table
- conn = "sqlite:///{}".format(filename)
- if table:
- df = pandas.DataFrame(
- {
- "col1": [0, 1, 2, 3, 4, 5, 6],
- "col2": [7, 8, 9, 10, 11, 12, 13],
- "col3": [14, 15, 16, 17, 18, 19, 20],
- "col4": [21, 22, 23, 24, 25, 26, 27],
- "col5": [0, 0, 0, 0, 0, 0, 0],
- }
- )
- df.to_sql(table, conn)
- return conn
-
- yield _sql_connection
-
- # Takedown the fixture
- for filename in filenames:
- if os.path.exists(filename):
- os.remove(filename)
-
-
-def setup_fwf_file(overwrite=False, fwf_data=None):
- if not overwrite and os.path.exists(TEST_FWF_FILENAME):
+def setup_fwf_file(filename, force=True, fwf_data=None):
+ if not force and os.path.exists(filename):
return
if fwf_data is None:
@@ -411,1315 +193,1682 @@ def setup_fwf_file(overwrite=False, fwf_data=None):
ACW000116041979TAVG -618 k -632 k 35 k 474 k 993 k 1566 k 1484 k 1483 k 1229 k 647 k 412 k -40 k
ACW000116041980TAVG -340 k -500 k -35 k 524 k 1071 k 1534 k 1655 k 1502 k 1269 k 660 k 138 k 125 k"""
- with open(TEST_FWF_FILENAME, "w") as f:
+ with open(filename, "w") as f:
f.write(fwf_data)
-def teardown_fwf_file():
- if os.path.exists(TEST_FWF_FILENAME):
- try:
- os.remove(TEST_FWF_FILENAME)
- except PermissionError:
- pass
-
-
-def test_from_parquet(make_parquet_file):
- make_parquet_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
- modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
- df_equals(modin_df, pandas_df)
-
-
-def test_from_parquet_with_columns(make_parquet_file):
- make_parquet_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
- modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
- df_equals(modin_df, pandas_df)
+def eval_to_file(modin_obj, pandas_obj, fn, extension, **fn_kwargs):
+ """Helper function to test `to_` methods.
+ Args:
+ modin_obj: Modin DataFrame or Series to test `to_` method.
+ pandas_obj: Pandas DataFrame or Series to test `to_` method.
+        fn: name of the method that should be tested.
+ extension: Extension of the test file.
+ """
+ unique_filename_modin = get_unique_filename(extension=extension)
+ unique_filename_pandas = get_unique_filename(extension=extension)
-def test_from_parquet_partition(make_parquet_file):
- make_parquet_file(SMALL_ROW_SIZE, directory=True)
+ try:
+ getattr(modin_obj, fn)(unique_filename_modin, **fn_kwargs)
+ getattr(pandas_obj, fn)(unique_filename_pandas, **fn_kwargs)
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
- modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
- df_equals(modin_df, pandas_df)
+ assert assert_files_eq(unique_filename_modin, unique_filename_pandas)
+ finally:
+ teardown_test_files([unique_filename_modin, unique_filename_pandas])
-def test_from_parquet_partition_with_columns(make_parquet_file):
- make_parquet_file(SMALL_ROW_SIZE, directory=True)
+@pytest.mark.usefixtures("TestReadCSVFixture")
+@pytest.mark.skipif(
+ IsExperimental.get() and Backend.get() == "Pyarrow",
+ reason="Segmentation fault; see PR #2347 ffor details",
+)
+class TestCsv:
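+    # Note: the tests in this class follow a common pattern: create (or reuse) a CSV
+    # file and call eval_io, which, judging by its usage here, runs the same I/O call
+    # (given by fn_name) through both modin.pandas and pandas and compares the results,
+    # including raised exceptions when check_exception_type / raising_exceptions are passed.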
+ # delimiter tests
+ @pytest.mark.parametrize("sep", [None, "_", ",", ".", "\n"])
+ @pytest.mark.parametrize("delimiter", ["_", ",", ".", "\n"])
+ @pytest.mark.parametrize("decimal", [".", "_"])
+ @pytest.mark.parametrize("thousands", [None, ",", "_", " "])
+ def test_read_csv_delimiters(
+ self, make_csv_file, sep, delimiter, decimal, thousands
+ ):
+ unique_filename = get_unique_filename()
+ make_csv_file(
+ filename=unique_filename,
+ delimiter=delimiter,
+ thousands_separator=thousands,
+ decimal_separator=decimal,
+ )
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
- modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
- df_equals(modin_df, pandas_df)
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer=unique_filename,
+ delimiter=delimiter,
+ sep=sep,
+ decimal=decimal,
+ thousands=thousands,
+ )
+ # Column and Index Locations and Names tests
+ @pytest.mark.skipif(
+ Engine.get() != "Python",
+ reason="many parameters combiantions fails: issue #2312, #2307",
+ )
+ @pytest.mark.parametrize("header", ["infer", None, 0])
+ @pytest.mark.parametrize("index_col", [None, "col1"])
+ @pytest.mark.parametrize("prefix", [None, "_", "col"])
+ @pytest.mark.parametrize(
+ "names", [None, ["col1"], ["c1", "c2", "c3", "c4", "c5", "c6", "c7"]]
+ )
+ @pytest.mark.parametrize(
+ "usecols", [None, ["col1"], ["col1", "col2", "col6"], [0, 1, 5]]
+ )
+ @pytest.mark.parametrize("skip_blank_lines", [True, False])
+ def test_read_csv_col_handling(
+ self,
+ request,
+ header,
+ index_col,
+ prefix,
+ names,
+ usecols,
+ skip_blank_lines,
+ ):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer=pytest.csvs_names["test_read_csv_blank_lines"],
+ header=header,
+ index_col=index_col,
+ prefix=prefix,
+ names=names,
+ usecols=usecols,
+ skip_blank_lines=skip_blank_lines,
+ )
-def test_from_parquet_partitioned_columns(make_parquet_file):
- make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])
+ # General Parsing Configuration
+ @pytest.mark.parametrize("dtype", [None, True])
+ @pytest.mark.parametrize("engine", [None, "python", "c"])
+ @pytest.mark.parametrize(
+ "converters",
+ [
+ None,
+ {
+ "col1": lambda x: np.int64(x) * 10,
+ "col2": pandas.to_datetime,
+ "col4": lambda x: x.replace(":", ";"),
+ },
+ ],
+ )
+ @pytest.mark.parametrize("skipfooter", [0, 10])
+ def test_read_csv_parsing_1(
+ self,
+ request,
+ dtype,
+ engine,
+ converters,
+ skipfooter,
+ ):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
- modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
- df_equals(modin_df, pandas_df)
+ if dtype:
+ dtype = {
+ col: "object"
+ for col in pandas.read_csv(
+ pytest.csvs_names["test_read_csv_regular"], nrows=1
+ ).columns
+ }
+
+ eval_io(
+ fn_name="read_csv",
+ check_exception_type=None, # issue #2320
+ raising_exceptions=None,
+ check_kwargs_callable=not callable(converters),
+ # read_csv kwargs
+ filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
+ dtype=dtype,
+ engine=engine,
+ converters=converters,
+ skipfooter=skipfooter,
+ )
+ @pytest.mark.parametrize("true_values", [["Yes"], ["Yes", "true"], None])
+ @pytest.mark.parametrize("false_values", [["No"], ["No", "false"], None])
+ @pytest.mark.parametrize("skiprows", [2, lambda x: x % 2])
+ @pytest.mark.parametrize("skipfooter", [0, 10])
+ @pytest.mark.parametrize("nrows", [35, None])
+ @pytest.mark.parametrize("names", [["c1", "c2", "c3", "c4"], None])
+ def test_read_csv_parsing_2(
+ self,
+ request,
+ true_values,
+ false_values,
+ skiprows,
+ skipfooter,
+ nrows,
+ names,
+ ):
+        if (false_values or true_values) and Engine.get() != "Python":
+            pytest.xfail("modin and pandas dataframes differ - issue #2446")
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
-def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
- make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])
+ eval_io(
+ fn_name="read_csv",
+ check_exception_type=None, # issue #2320
+ raising_exceptions=None,
+ check_kwargs_callable=not callable(skiprows),
+ # read_csv kwargs
+ filepath_or_buffer=pytest.csvs_names["test_read_csv_yes_no"],
+ true_values=true_values,
+ false_values=false_values,
+ skiprows=skiprows,
+ skipfooter=skipfooter,
+ nrows=nrows,
+ names=names,
+ )
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
- modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
- df_equals(modin_df, pandas_df)
+ def test_read_csv_skipinitialspace(self):
+ unique_filename = get_unique_filename()
+ str_initial_spaces = (
+ "col1,col2,col3,col4\n"
+ "five, six, seven, eight\n"
+ " five, six, seven, eight\n"
+ "five, six, seven, eight\n"
+ )
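+        # eval_io_from_str presumably materializes the raw string into the test file and
+        # compares modin's and pandas' read_csv results on it (assumption based on how it
+        # is used throughout this class).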
+ eval_io_from_str(str_initial_spaces, unique_filename, skipinitialspace=True)
-def test_from_parquet_pandas_index():
- # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
- pandas_df = pandas.DataFrame(
- {
- "idx": np.random.randint(0, 100_000, size=2000),
- "A": np.random.randint(0, 100_000, size=2000),
- "B": ["a", "b"] * 1000,
- "C": ["c"] * 2000,
- }
+ @pytest.mark.xfail(reason="infinite recursion error - issue #2032")
+ @pytest.mark.parametrize(
+ "test_case", ["single_element", "single_column", "multiple_columns"]
)
- filepath = "tmp.parquet"
- pandas_df.set_index("idx").to_parquet(filepath)
- # read the same parquet using modin.pandas
- df_equals(pd.read_parquet(filepath), pandas.read_parquet(filepath))
-
- pandas_df.set_index(["idx", "A"]).to_parquet(filepath)
- df_equals(pd.read_parquet(filepath), pandas.read_parquet(filepath))
- os.remove(filepath)
-
-
-def test_from_parquet_pandas_index_partitioned():
- # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
- pandas_df = pandas.DataFrame(
- {
- "idx": np.random.randint(0, 100_000, size=2000),
- "A": np.random.randint(0, 10, size=2000),
- "B": ["a", "b"] * 1000,
- "C": ["c"] * 2000,
+ def test_read_csv_squeeze(self, test_case):
+ unique_filename = get_unique_filename()
+
+ str_single_element = "1"
+ str_single_col = "1\n2\n3\n"
+ str_four_cols = "1, 2, 3, 4\n5, 6, 7, 8\n9, 10, 11, 12\n"
+ case_to_data = {
+ "single_element": str_single_element,
+ "single_column": str_single_col,
+ "multiple_columns": str_four_cols,
}
- )
- filepath = "tmp_folder.parquet"
- pandas_df.set_index("idx").to_parquet(filepath, partition_cols=["A"])
- # read the same parquet using modin.pandas
- df_equals(pd.read_parquet(filepath), pandas.read_parquet(filepath))
- shutil.rmtree(filepath)
-
-
-def test_from_parquet_hdfs():
- path = "modin/pandas/test/data/hdfs.parquet"
- pandas_df = pandas.read_parquet(path)
- modin_df = pd.read_parquet(path)
- df_equals(modin_df, pandas_df)
-
-
-def test_from_json():
- setup_json_file(SMALL_ROW_SIZE)
- pandas_df = pandas.read_json(TEST_JSON_FILENAME)
- modin_df = pd.read_json(TEST_JSON_FILENAME)
-
- df_equals(modin_df, pandas_df)
-
- teardown_json_file()
+ eval_io_from_str(case_to_data[test_case], unique_filename, squeeze=True)
+ eval_io_from_str(
+ case_to_data[test_case], unique_filename, header=None, squeeze=True
+ )
+ def test_read_csv_mangle_dupe_cols(self):
+ unique_filename = get_unique_filename()
+ str_non_unique_cols = "col,col,col,col\n5, 6, 7, 8\n9, 10, 11, 12\n"
+ eval_io_from_str(str_non_unique_cols, unique_filename, mangle_dupe_cols=True)
+
+ # NA and Missing Data Handling tests
+ @pytest.mark.parametrize("na_values", ["custom_nan", "73"])
+ @pytest.mark.parametrize("keep_default_na", [True, False])
+ @pytest.mark.parametrize("na_filter", [True, False])
+ @pytest.mark.parametrize("verbose", [True, False])
+ @pytest.mark.parametrize("skip_blank_lines", [True, False])
+ def test_read_csv_nans_handling(
+ self,
+ na_values,
+ keep_default_na,
+ na_filter,
+ verbose,
+ skip_blank_lines,
+ ):
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"],
+ na_values=na_values,
+ keep_default_na=keep_default_na,
+ na_filter=na_filter,
+ verbose=verbose,
+ skip_blank_lines=skip_blank_lines,
+ )
-def test_from_json_categories():
- pandas_df = pandas.read_json(
- "modin/pandas/test/data/test_categories.json",
- dtype={"one": "int64", "two": "category"},
+ # Datetime Handling tests
+ @pytest.mark.parametrize(
+ "parse_dates",
+ [
+ True,
+ False,
+ ["col2"],
+ ["col2", "col4"],
+ [1, 3],
+ pytest.param(
+ {"foo": ["col2", "col4"]},
+ marks=pytest.mark.xfail(
+ Engine.get() != "Python",
+ reason="Exception: Internal Error - issue #2073",
+ ),
+ ),
+ ],
)
- modin_df = pd.read_json(
- "modin/pandas/test/data/test_categories.json",
- dtype={"one": "int64", "two": "category"},
+ @pytest.mark.parametrize("infer_datetime_format", [True, False])
+ @pytest.mark.parametrize("keep_date_col", [True, False])
+ @pytest.mark.parametrize(
+ "date_parser", [None, lambda x: pandas.datetime.strptime(x, "%Y-%m-%d")]
)
- df_equals(modin_df, pandas_df)
-
-
-def test_from_json_lines():
- setup_json_lines_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_json(TEST_JSON_FILENAME, lines=True)
- modin_df = pd.read_json(TEST_JSON_FILENAME, lines=True)
- df_equals(modin_df, pandas_df)
-
- teardown_json_file()
-
-
-@pytest.mark.parametrize(
- "data",
- [json_short_string, json_short_bytes, json_long_string, json_long_bytes],
-)
-def test_read_json_string_bytes(data):
- with pytest.warns(UserWarning):
- modin_df = pd.read_json(data)
- # For I/O objects we need to rewind to reuse the same object.
- if hasattr(data, "seek"):
- data.seek(0)
- df_equals(modin_df, pandas.read_json(data))
-
-
-def test_from_html():
- setup_html_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_html(TEST_HTML_FILENAME)[0]
- modin_df = pd.read_html(TEST_HTML_FILENAME)
-
- df_equals(modin_df, pandas_df)
-
- teardown_html_file()
-
-
-@pytest.mark.skip(reason="No clipboard on Travis")
-def test_from_clipboard():
- setup_clipboard(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_clipboard()
- modin_df = pd.read_clipboard()
-
- df_equals(modin_df, pandas_df)
-
-
-@pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details")
-def test_from_excel():
- setup_excel_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME)
- modin_df = pd.read_excel(TEST_EXCEL_FILENAME)
-
- df_equals(modin_df, pandas_df)
-
- teardown_excel_file()
-
-
-def test_from_excel_engine():
- setup_excel_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, engine="xlrd")
- with pytest.warns(UserWarning):
- modin_df = pd.read_excel(TEST_EXCEL_FILENAME, engine="xlrd")
-
- df_equals(modin_df, pandas_df)
-
- teardown_excel_file()
-
-
-def test_from_excel_index_col():
- setup_excel_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, index_col=0)
- with pytest.warns(UserWarning):
- modin_df = pd.read_excel(TEST_EXCEL_FILENAME, index_col=0)
-
- df_equals(modin_df, pandas_df)
-
- teardown_excel_file()
+ @pytest.mark.parametrize("dayfirst", [True, False])
+ @pytest.mark.parametrize("cache_dates", [True, False])
+ def test_read_csv_datetime(
+ self,
+ request,
+ parse_dates,
+ infer_datetime_format,
+ keep_date_col,
+ date_parser,
+ dayfirst,
+ cache_dates,
+ ):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ raising_exceptions = io_ops_bad_exc # default value
+ if isinstance(parse_dates, dict) and callable(date_parser):
+            # In this case TypeError is raised: "<lambda>() takes 1 positional argument but 2 were given"
+ raising_exceptions = list(io_ops_bad_exc)
+ raising_exceptions.remove(TypeError)
+
+ eval_io(
+ fn_name="read_csv",
+ check_kwargs_callable=not callable(date_parser),
+ raising_exceptions=raising_exceptions,
+ # read_csv kwargs
+ filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
+ parse_dates=parse_dates,
+ infer_datetime_format=infer_datetime_format,
+ keep_date_col=keep_date_col,
+ date_parser=date_parser,
+ dayfirst=dayfirst,
+ cache_dates=cache_dates,
+ )
-def test_from_excel_all_sheets():
- setup_excel_file(SMALL_ROW_SIZE)
+ # Iteration tests
+ @pytest.mark.parametrize("iterator", [True, False])
+ def test_read_csv_iteration(self, iterator):
+ filename = pytest.csvs_names["test_read_csv_regular"]
- pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME, sheet_name=None)
- modin_df = pd.read_excel(TEST_EXCEL_FILENAME, sheet_name=None)
+ # Tests __next__ and correctness of reader as an iterator
+ # Use larger chunksize to read through file quicker
+ rdf_reader = pd.read_csv(filename, chunksize=500, iterator=iterator)
+ pd_reader = pandas.read_csv(filename, chunksize=500, iterator=iterator)
- assert isinstance(pandas_df, (OrderedDict, dict))
- assert isinstance(modin_df, type(pandas_df))
+ for modin_df, pd_df in zip(rdf_reader, pd_reader):
+ df_equals(modin_df, pd_df)
- assert pandas_df.keys() == modin_df.keys()
+ # Tests that get_chunk works correctly
+ rdf_reader = pd.read_csv(filename, chunksize=1, iterator=iterator)
+ pd_reader = pandas.read_csv(filename, chunksize=1, iterator=iterator)
- for key in pandas_df.keys():
- df_equals(modin_df.get(key), pandas_df.get(key))
+ modin_df = rdf_reader.get_chunk(1)
+ pd_df = pd_reader.get_chunk(1)
- teardown_excel_file()
+ df_equals(modin_df, pd_df)
+ # Tests that read works correctly
+ rdf_reader = pd.read_csv(filename, chunksize=1, iterator=iterator)
+ pd_reader = pandas.read_csv(filename, chunksize=1, iterator=iterator)
-@pytest.mark.parametrize(
- "sheet_name",
- ["Sheet1", "AnotherSpecialName", "SpecialName", "SecondSpecialName", 0, 1, 2, 3],
-)
-def test_from_excel_sheet_name(sheet_name):
- fname = "modin/pandas/test/data/modin_error_book.xlsx"
- modin_df = pd.read_excel(fname, sheet_name=sheet_name)
- pandas_df = pandas.read_excel(fname, sheet_name=sheet_name)
- df_equals(modin_df, pandas_df)
+ modin_df = rdf_reader.read()
+ pd_df = pd_reader.read()
+ df_equals(modin_df, pd_df)
-# @pytest.mark.skip(reason="Arrow version mismatch between Pandas and Feather")
-def test_from_feather():
- setup_feather_file(SMALL_ROW_SIZE)
+ def test_read_csv_encoding_976(self):
+ file_name = "modin/pandas/test/data/issue_976.csv"
+ names = [str(i) for i in range(11)]
- pandas_df = pandas.read_feather(TEST_FEATHER_FILENAME)
- modin_df = pd.read_feather(TEST_FEATHER_FILENAME)
+ kwargs = {
+ "sep": ";",
+ "names": names,
+ "encoding": "windows-1251",
+ }
+ df1 = pd.read_csv(file_name, **kwargs)
+ df2 = pandas.read_csv(file_name, **kwargs)
+ # these columns contain data of various types in partitions
+ # see #1931 for details;
+ df1 = df1.drop(["4", "5"], axis=1)
+ df2 = df2.drop(["4", "5"], axis=1)
+
+ df_equals(df1, df2)
+
+ # Quoting, Compression, and File Format parameters tests
+ @pytest.mark.parametrize("compression", ["infer", "gzip", "bz2", "xz", "zip"])
+ @pytest.mark.parametrize(
+ "encoding",
+ [None, "latin8", "ISO-8859-1", "latin1", "iso-8859-1", "cp1252", "utf8"],
+ )
+ @pytest.mark.parametrize("engine", [None, "python", "c"])
+ def test_read_csv_compression(self, make_csv_file, compression, encoding, engine):
+ unique_filename = get_unique_filename()
+ make_csv_file(
+ filename=unique_filename, encoding=encoding, compression=compression
+ )
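+        # make_csv_file is assumed to append the compression-specific extension (looked
+        # up via COMP_TO_EXT) when compression is requested, so point read_csv at that
+        # path; with "infer" the plain filename is used as-is.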
+ compressed_file_path = (
+ f"{unique_filename}.{COMP_TO_EXT[compression]}"
+ if compression != "infer"
+ else unique_filename
+ )
- df_equals(modin_df, pandas_df)
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer=compressed_file_path,
+ compression=compression,
+ encoding=encoding,
+ engine=engine,
+ )
- teardown_feather_file()
+ @pytest.mark.parametrize("thousands", [None, ",", "_", " "])
+ @pytest.mark.parametrize("decimal", [".", "_"])
+ @pytest.mark.parametrize("lineterminator", [None, "x", "\n"])
+ @pytest.mark.parametrize("escapechar", [None, "d", "x"])
+ @pytest.mark.parametrize("dialect", ["test_csv_dialect", None])
+ def test_read_csv_file_format(
+ self,
+ request,
+ make_csv_file,
+ thousands,
+ decimal,
+ lineterminator,
+ escapechar,
+ dialect,
+ ):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ elif Engine.get() != "Python" and lineterminator == "x":
+ pytest.xfail("read_csv with Ray engine outputs empty frame - issue #2493")
+ elif Engine.get() != "Python" and escapechar:
+ pytest.xfail(
+ "read_csv with Ray engine fails with some 'escapechar' parameters - issue #2494"
+ )
+ elif Engine.get() != "Python" and dialect:
+ pytest.xfail(
+ "read_csv with Ray engine fails with `dialect` parameter - issue #2508"
+ )
+ unique_filename = get_unique_filename()
+ if dialect:
+ test_csv_dialect_params = {
+ "delimiter": "_",
+ "doublequote": False,
+ "escapechar": "\\",
+ "quotechar": "d",
+ "quoting": csv.QUOTE_ALL,
+ }
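+            # Register the custom dialect and fetch the resulting Dialect object so that
+            # the test file is written and read back with the same parameters.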
+ csv.register_dialect(dialect, **test_csv_dialect_params)
+ dialect = csv.get_dialect(dialect)
+ make_csv_file(filename=unique_filename, **test_csv_dialect_params)
+ else:
+ make_csv_file(
+ filename=unique_filename,
+ thousands_separator=thousands,
+ decimal_separator=decimal,
+ escapechar=escapechar,
+ line_terminator=lineterminator,
+ )
-@pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
-def test_from_hdf():
- setup_hdf_file(SMALL_ROW_SIZE, format=None)
+ eval_io(
+ check_exception_type=None, # issue #2320
+ raising_exceptions=None,
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer=unique_filename,
+ thousands=thousands,
+ decimal=decimal,
+ lineterminator=lineterminator,
+ escapechar=escapechar,
+ dialect=dialect,
+ )
- pandas_df = pandas.read_hdf(TEST_READ_HDF_FILENAME, key="df")
- modin_df = pd.read_hdf(TEST_READ_HDF_FILENAME, key="df")
+ @pytest.mark.parametrize(
+ "quoting",
+ [csv.QUOTE_ALL, csv.QUOTE_MINIMAL, csv.QUOTE_NONNUMERIC, csv.QUOTE_NONE],
+ )
+ @pytest.mark.parametrize("quotechar", ['"', "_", "d"])
+ @pytest.mark.parametrize("doublequote", [True, False])
+ @pytest.mark.parametrize("comment", [None, "#", "x"])
+ def test_read_csv_quoting(
+ self,
+ make_csv_file,
+ quoting,
+ quotechar,
+ doublequote,
+ comment,
+ ):
+        # in these cases escapechar should be set, otherwise the error occurs:
+        # "_csv.Error: need to escape, but no escapechar set"
+ use_escapechar = (
+ not doublequote and quotechar != '"' and quoting != csv.QUOTE_NONE
+ )
+ escapechar = "\\" if use_escapechar else None
+ unique_filename = get_unique_filename()
+
+ make_csv_file(
+ filename=unique_filename,
+ quoting=quoting,
+ quotechar=quotechar,
+ doublequote=doublequote,
+ escapechar=escapechar,
+ comment_col_char=comment,
+ )
- df_equals(modin_df, pandas_df)
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer=unique_filename,
+ quoting=quoting,
+ quotechar=quotechar,
+ doublequote=doublequote,
+ escapechar=escapechar,
+ comment=comment,
+ )
- teardown_hdf_file()
+ # Error Handling parameters tests
+ @pytest.mark.xfail(
+ Engine.get() != "Python",
+ reason="read_csv with Ray engine doen't raise `bad lines` exceptions - issue #2500",
+ )
+ @pytest.mark.parametrize("warn_bad_lines", [True, False])
+ @pytest.mark.parametrize("error_bad_lines", [True, False])
+ def test_read_csv_error_handling(
+ self,
+ warn_bad_lines,
+ error_bad_lines,
+ ):
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer=pytest.csvs_names["test_read_csv_bad_lines"],
+ warn_bad_lines=warn_bad_lines,
+ error_bad_lines=error_bad_lines,
+ )
+ # Internal parameters tests
+ @pytest.mark.parametrize("use_str_data", [True, False])
+ @pytest.mark.parametrize("engine", [None, "python", "c"])
+ @pytest.mark.parametrize("delimiter", [",", " "])
+ @pytest.mark.parametrize("delim_whitespace", [True, False])
+ @pytest.mark.parametrize("low_memory", [True, False])
+ @pytest.mark.parametrize("memory_map", [True, False])
+ @pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
+ def test_read_csv_internal(
+ self,
+ make_csv_file,
+ use_str_data,
+ engine,
+ delimiter,
+ delim_whitespace,
+ low_memory,
+ memory_map,
+ float_precision,
+ ):
+ if Engine.get() != "Python" and delimiter == " ":
+ pytest.xfail(
+ "read_csv with Ray engine doesn't \
+ raise exceptions while Pandas raises - issue #2320"
+ )
-@pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
-def test_from_hdf_format():
- setup_hdf_file(SMALL_ROW_SIZE, format="table")
+        # In this case TypeError is raised: "cannot use a string pattern on a bytes-like object",
+        # so TypeError should be excluded from the raising_exceptions list in order to check that
+        # the same exceptions are raised by Pandas and Modin
+ case_with_TypeError_exc = (
+ engine == "python"
+ and delimiter == ","
+ and delim_whitespace
+ and low_memory
+ and memory_map
+ and float_precision is None
+ )
- pandas_df = pandas.read_hdf(TEST_READ_HDF_FILENAME, key="df")
- modin_df = pd.read_hdf(TEST_READ_HDF_FILENAME, key="df")
+ raising_exceptions = io_ops_bad_exc # default value
+ if case_with_TypeError_exc:
+ raising_exceptions = list(io_ops_bad_exc)
+ raising_exceptions.remove(TypeError)
+
+ kwargs = {
+ "engine": engine,
+ "delimiter": delimiter,
+ "delim_whitespace": delim_whitespace,
+ "low_memory": low_memory,
+ "memory_map": memory_map,
+ "float_precision": float_precision,
+ }
- df_equals(modin_df, pandas_df)
+ unique_filename = get_unique_filename()
- teardown_hdf_file()
+ if use_str_data:
+ str_delim_whitespaces = (
+ "col1 col2 col3 col4\n5 6 7 8\n9 10 11 12\n"
+ )
+ eval_io_from_str(
+ str_delim_whitespaces,
+ unique_filename,
+ raising_exceptions=raising_exceptions,
+ **kwargs,
+ )
+ else:
+ make_csv_file(
+ filename=unique_filename,
+ delimiter=delimiter,
+ )
+ eval_io(
+ filepath_or_buffer=unique_filename,
+ fn_name="read_csv",
+ raising_exceptions=raising_exceptions,
+ **kwargs,
+ )
-def test_from_stata():
- setup_stata_file(SMALL_ROW_SIZE)
+ # Issue related, specific or corner cases
+ @pytest.mark.parametrize("nrows", [2, None])
+ def test_read_csv_bad_quotes(self, nrows):
+ csv_bad_quotes = (
+ '1, 2, 3, 4\none, two, three, four\nfive, "six", seven, "eight\n'
+ )
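+        # The last row intentionally leaves a quoted field unterminated to exercise
+        # the readers' handling of malformed quotes.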
- pandas_df = pandas.read_stata(TEST_STATA_FILENAME)
- modin_df = pd.read_stata(TEST_STATA_FILENAME)
+ unique_filename = get_unique_filename()
- df_equals(modin_df, pandas_df)
+ eval_io_from_str(csv_bad_quotes, unique_filename, nrows=nrows)
- teardown_stata_file()
+ def test_read_csv_categories(self, request):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer="modin/pandas/test/data/test_categories.csv",
+ names=["one", "two"],
+ dtype={"one": "int64", "two": "category"},
+ )
+ @pytest.mark.parametrize("encoding", [None, "utf-8"])
+ @pytest.mark.parametrize("parse_dates", [False, ["timestamp"]])
+ @pytest.mark.parametrize("index_col", [None, 0, 2])
+ @pytest.mark.parametrize("header", ["infer", 0])
+ @pytest.mark.parametrize(
+ "names",
+ [
+ None,
+ ["timestamp", "symbol", "high", "low", "open", "close", "spread", "volume"],
+ ],
+ )
+ def test_read_csv_parse_dates(
+ self, request, names, header, index_col, parse_dates, encoding
+ ):
+ if (
+ parse_dates
+ and request.config.getoption("--simulate-cloud").lower() != "off"
+ ):
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
-def test_from_pickle():
- setup_pickle_file(SMALL_ROW_SIZE)
+ if names is not None and header == "infer":
+ pytest.xfail(
+ "read_csv with Ray engine works incorrectly with date data and names parameter provided - issue #2509"
+ )
- pandas_df = pandas.read_pickle(TEST_PICKLE_FILENAME)
- modin_df = pd.read_pickle(TEST_PICKLE_FILENAME)
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer="modin/pandas/test/data/test_time_parsing.csv",
+ names=names,
+ header=header,
+ index_col=index_col,
+ parse_dates=parse_dates,
+ encoding=encoding,
+ )
- df_equals(modin_df, pandas_df)
+ @pytest.mark.skipif(Engine.get() == "Python", reason="Using pandas implementation")
+ def test_read_csv_s3(self):
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer="s3://noaa-ghcn-pds/csv/1788.csv",
+ )
- teardown_pickle_file()
+ @pytest.mark.parametrize("names", [list("XYZ"), None])
+ @pytest.mark.parametrize("skiprows", [1, 2, 3, 4, None])
+ def test_read_csv_skiprows_names(self, names, skiprows):
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer="modin/pandas/test/data/issue_2239.csv",
+ names=names,
+ skiprows=skiprows,
+ )
-def test_from_sql(make_sql_connection):
- filename = "test_from_sql.db"
- table = "test_from_sql"
- conn = make_sql_connection(filename, table)
- query = "select * from {0}".format(table)
+ def test_read_csv_default_to_pandas(self, request):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ with pytest.warns(UserWarning):
+ # This tests that we default to pandas on a buffer
+ from io import StringIO
- pandas_df = pandas.read_sql(query, conn)
- modin_df = pd.read_sql(query, conn)
+ pd.read_csv(
+ StringIO(open(pytest.csvs_names["test_read_csv_regular"], "r").read())
+ )
- df_equals(modin_df, pandas_df)
+ with pytest.warns(UserWarning):
+ pd.read_csv(
+ pytest.csvs_names["test_read_csv_regular"],
+ skiprows=lambda x: x in [0, 2],
+ )
- pandas_df = pandas.read_sql(query, conn, index_col="index")
- modin_df = pd.read_sql(query, conn, index_col="index")
+ def test_read_csv_default_to_pandas_url(self, request):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ # We haven't implemented read_csv from https, but if it's implemented, then this needs to change
+ eval_io(
+ fn_name="read_csv",
+ modin_warning=UserWarning,
+ # read_csv kwargs
+ filepath_or_buffer="https://raw.githubusercontent.com/modin-project/modin/master/modin/pandas/test/data/blah.csv",
+ )
- df_equals(modin_df, pandas_df)
+ @pytest.mark.parametrize("nrows", [21, 5, None])
+ @pytest.mark.parametrize("skiprows", [4, 1, 500, None])
+ def test_read_csv_newlines_in_quotes(self, nrows, skiprows):
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer="modin/pandas/test/data/newlines.csv",
+ nrows=nrows,
+ skiprows=skiprows,
+ cast_to_str=True,
+ )
- with pytest.warns(UserWarning):
- pd.read_sql_query(query, conn)
+ def test_read_csv_sep_none(self, request):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ eval_io(
+ fn_name="read_csv",
+ modin_warning=ParserWarning,
+ # read_csv kwargs
+ filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
+ sep=None,
+ )
- with pytest.warns(UserWarning):
- pd.read_sql_table(table, conn)
+ def test_read_csv_incorrect_data(self, request):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer="modin/pandas/test/data/test_categories.json",
+ )
- # Test SQLAlchemy engine
- conn = sa.create_engine(conn)
- pandas_df = pandas.read_sql(query, conn)
- modin_df = pd.read_sql(query, conn)
+ @pytest.mark.parametrize(
+ "kwargs",
+ [
+ {"names": [5, 1, 3, 4, 2, 6]},
+ {"names": [0]},
+ {"names": None, "usecols": [1, 0, 2]},
+ {"names": [3, 1, 2, 5], "usecols": [4, 1, 3, 2]},
+ ],
+ )
+ def test_read_csv_names_neq_num_cols(self, request, kwargs):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer="modin/pandas/test/data/issue_2074.csv",
+ **kwargs,
+ )
- df_equals(modin_df, pandas_df)
+ def test_dataframe_to_csv(self, request):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ pandas_df = pandas.read_csv(pytest.csvs_names["test_read_csv_regular"])
+ modin_df = pd.DataFrame(pandas_df)
+ eval_to_file(
+ modin_obj=modin_df, pandas_obj=pandas_df, fn="to_csv", extension="csv"
+ )
- # Test SQLAlchemy Connection
- conn = conn.connect()
- pandas_df = pandas.read_sql(query, conn)
- modin_df = pd.read_sql(query, conn)
+ def test_series_to_csv(self, request):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip(
+ "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
+ )
+ pandas_s = pandas.read_csv(
+ pytest.csvs_names["test_read_csv_regular"], usecols=["col1"]
+ ).squeeze()
+ modin_s = pd.Series(pandas_s)
+ eval_to_file(
+ modin_obj=modin_s, pandas_obj=pandas_s, fn="to_csv", extension="csv"
+ )
- df_equals(modin_df, pandas_df)
+ def test_read_csv_within_decorator(self):
+ @dummy_decorator()
+ def wrapped_read_csv(file, method):
+ if method == "pandas":
+ return pandas.read_csv(file)
+ if method == "modin":
+ return pd.read_csv(file)
-def test_from_sql_with_chunksize(make_sql_connection):
- filename = "test_from_sql.db"
- table = "test_from_sql"
- conn = make_sql_connection(filename, table)
- query = "select * from {0}".format(table)
+ pandas_df = wrapped_read_csv(
+ pytest.csvs_names["test_read_csv_regular"], method="pandas"
+ )
+ modin_df = wrapped_read_csv(
+ pytest.csvs_names["test_read_csv_regular"], method="modin"
+ )
- pandas_gen = pandas.read_sql(query, conn, chunksize=10)
- modin_gen = pd.read_sql(query, conn, chunksize=10)
- for modin_df, pandas_df in zip(modin_gen, pandas_gen):
df_equals(modin_df, pandas_df)
+ @pytest.mark.parametrize("read_mode", ["r", "rb"])
+ def test_read_csv_file_handle(self, request, read_mode, make_csv_file):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip("Cannot pickle file handles. See comments in PR #2625")
-@pytest.mark.skip(reason="No SAS write methods in Pandas")
-def test_from_sas():
- pandas_df = pandas.read_sas(TEST_SAS_FILENAME)
- modin_df = pd.read_sas(TEST_SAS_FILENAME)
-
- df_equals(modin_df, pandas_df)
-
-
-@pytest.mark.parametrize("nrows", [123, None])
-def test_from_csv(make_csv_file, nrows):
- make_csv_file()
-
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, nrows=nrows)
- modin_df = pd.read_csv(TEST_CSV_FILENAME, nrows=nrows)
-
- df_equals(modin_df, pandas_df)
+ unique_filename = get_unique_filename()
+ make_csv_file(filename=unique_filename)
- pandas_df = pandas.read_csv(Path(TEST_CSV_FILENAME), nrows=nrows)
- modin_df = pd.read_csv(Path(TEST_CSV_FILENAME), nrows=nrows)
+ try:
+ with open(unique_filename, mode=read_mode) as buffer:
+ df_pandas = pandas.read_csv(buffer)
+ buffer.seek(0)
+ df_modin = pd.read_csv(buffer)
+ df_equals(df_modin, df_pandas)
+ finally:
+ teardown_test_files([unique_filename])
+
+
+class TestTable:
+ def test_read_table(self, make_csv_file):
+ unique_filename = get_unique_filename()
+ make_csv_file(filename=unique_filename, delimiter="\t")
+ eval_io(
+ fn_name="read_table",
+ # read_table kwargs
+ filepath_or_buffer=unique_filename,
+ )
- df_equals(modin_df, pandas_df)
+ def test_read_table_within_decorator(self, make_csv_file):
+ unique_filename = get_unique_filename()
+ make_csv_file(filename=unique_filename, delimiter="\t")
+ @dummy_decorator()
+ def wrapped_read_table(file, method):
+ if method == "pandas":
+ return pandas.read_table(file)
-@pytest.mark.parametrize("nrows", [123, None])
-def test_from_csv_sep_none(make_csv_file, nrows):
- make_csv_file()
+ if method == "modin":
+ return pd.read_table(file)
- with pytest.warns(ParserWarning):
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, sep=None, nrows=nrows)
- with pytest.warns(ParserWarning):
- modin_df = pd.read_csv(TEST_CSV_FILENAME, sep=None, nrows=nrows)
- df_equals(modin_df, pandas_df)
+ pandas_df = wrapped_read_table(unique_filename, method="pandas")
+ modin_df = wrapped_read_table(unique_filename, method="modin")
+ df_equals(modin_df, pandas_df)
-@pytest.mark.parametrize("nrows", [2, None])
-def test_from_csv_bad_quotes(nrows):
- csv_bad_quotes = """1, 2, 3, 4
-one, two, three, four
-five, "six", seven, "eight
-"""
- with open(TEST_CSV_FILENAME, "w") as f:
- f.write(csv_bad_quotes)
+class TestParquet:
+ @pytest.mark.parametrize("columns", [None, ["col1"]])
+ def test_read_parquet(self, make_parquet_file, columns):
+ unique_filename = get_unique_filename(extension="parquet")
+ make_parquet_file(filename=unique_filename)
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, nrows=nrows)
- modin_df = pd.read_csv(TEST_CSV_FILENAME, nrows=nrows)
+ eval_io(
+ fn_name="read_parquet",
+ # read_parquet kwargs
+ path=unique_filename,
+ columns=columns,
+ )
- df_equals(modin_df, pandas_df)
+ @pytest.mark.parametrize("columns", [None, ["col1"]])
+    def test_read_parquet_directory(self, make_parquet_file, columns):
+ unique_filename = get_unique_filename(extension=None)
+ make_parquet_file(filename=unique_filename, directory=True)
+ eval_io(
+ fn_name="read_parquet",
+ # read_parquet kwargs
+ path=unique_filename,
+ columns=columns,
+ )
-@pytest.mark.parametrize("nrows", [2, None])
-def test_from_csv_quote_none(nrows):
- csv_bad_quotes = """1, 2, 3, 4
-one, two, three, four
-five, "six", seven, "eight
-"""
- with open(TEST_CSV_FILENAME, "w") as f:
- f.write(csv_bad_quotes)
+ @pytest.mark.parametrize("columns", [None, ["col1"]])
+ def test_read_parquet_partitioned_directory(self, make_parquet_file, columns):
+ unique_filename = get_unique_filename(extension=None)
+ make_parquet_file(filename=unique_filename, partitioned_columns=["col1"])
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, quoting=csv.QUOTE_NONE, nrows=nrows)
- modin_df = pd.read_csv(TEST_CSV_FILENAME, quoting=csv.QUOTE_NONE, nrows=nrows)
+ eval_io(
+ fn_name="read_parquet",
+ # read_parquet kwargs
+ path=unique_filename,
+ columns=columns,
+ )
- df_equals(modin_df, pandas_df)
+ def test_read_parquet_pandas_index(self):
+ # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
+ unique_filename = get_unique_filename(extension="parquet")
+ pandas_df = pandas.DataFrame(
+ {
+ "idx": np.random.randint(0, 100_000, size=2000),
+ "A": np.random.randint(0, 100_000, size=2000),
+ "B": ["a", "b"] * 1000,
+ "C": ["c"] * 2000,
+ }
+ )
+ try:
+ pandas_df.set_index("idx").to_parquet(unique_filename)
+ # read the same parquet using modin.pandas
+ df_equals(
+ pd.read_parquet(unique_filename), pandas.read_parquet(unique_filename)
+ )
+ pandas_df.set_index(["idx", "A"]).to_parquet(unique_filename)
+ df_equals(
+ pd.read_parquet(unique_filename), pandas.read_parquet(unique_filename)
+ )
+ finally:
+ os.remove(unique_filename)
+
+ def test_read_parquet_pandas_index_partitioned(self):
+ # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
+ unique_filename = get_unique_filename(extension="parquet")
+ pandas_df = pandas.DataFrame(
+ {
+ "idx": np.random.randint(0, 100_000, size=2000),
+ "A": np.random.randint(0, 10, size=2000),
+ "B": ["a", "b"] * 1000,
+ "C": ["c"] * 2000,
+ }
+ )
+ try:
+ pandas_df.set_index("idx").to_parquet(unique_filename, partition_cols=["A"])
+ # read the same parquet using modin.pandas
+ df_equals(
+ pd.read_parquet(unique_filename), pandas.read_parquet(unique_filename)
+ )
+ finally:
+ shutil.rmtree(unique_filename)
+
+ def test_read_parquet_hdfs(self):
+ eval_io(
+ fn_name="read_parquet",
+ # read_parquet kwargs
+ path="modin/pandas/test/data/hdfs.parquet",
+ )
-def test_from_csv_categories():
- pandas_df = pandas.read_csv(
- "modin/pandas/test/data/test_categories.csv",
- names=["one", "two"],
- dtype={"one": "int64", "two": "category"},
+ @pytest.mark.skipif(
+ Engine.get() == "Python",
+ reason="S3-like path doesn't support in pandas with anonymous credentials. See issue #2301.",
)
- modin_df = pd.read_csv(
- "modin/pandas/test/data/test_categories.csv",
- names=["one", "two"],
- dtype={"one": "int64", "two": "category"},
- )
- df_equals(modin_df, pandas_df)
-
-
-def test_from_csv_gzip(make_csv_file):
- make_csv_file(compression="gzip")
- gzip_path = "{}.gz".format(TEST_CSV_FILENAME)
-
- pandas_df = pandas.read_csv(gzip_path)
- modin_df = pd.read_csv(gzip_path)
- df_equals(modin_df, pandas_df)
-
- pandas_df = pandas.read_csv(gzip_path, compression="gzip")
- modin_df = pd.read_csv(gzip_path, compression="gzip")
- df_equals(modin_df, pandas_df)
-
-
-def test_from_csv_bz2(make_csv_file):
- make_csv_file(compression="bz2")
- bz2_path = "{}.bz2".format(TEST_CSV_FILENAME)
-
- pandas_df = pandas.read_csv(bz2_path)
- modin_df = pd.read_csv(bz2_path)
- df_equals(modin_df, pandas_df)
-
- pandas_df = pandas.read_csv(bz2_path, compression="bz2")
- modin_df = pd.read_csv(bz2_path, compression="bz2")
- df_equals(modin_df, pandas_df)
+ def test_read_parquet_s3(self):
+ import s3fs
+
+        # Pandas currently supports only default credentials for boto, therefore
+        # we use S3FileSystem with `anon=True` to make testing possible.
+ dataset_url = "s3://aws-roda-hcls-datalake/chembl_27/chembl_27_public_tissue_dictionary/part-00000-66508102-96fa-4fd9-a0fd-5bc072a74293-c000.snappy.parquet"
+ fs = s3fs.S3FileSystem(anon=True)
+ pandas_df = pandas.read_parquet(fs.open(dataset_url, "rb"))
+ modin_df_s3fs = pd.read_parquet(fs.open(dataset_url, "rb"))
+ df_equals(pandas_df, modin_df_s3fs)
+
+ # Modin supports default and anonymous credentials and resolves this internally.
+ modin_df_s3 = pd.read_parquet(dataset_url)
+ df_equals(pandas_df, modin_df_s3)
+
+ def test_to_parquet(self):
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
+ eval_to_file(
+ modin_obj=modin_df,
+ pandas_obj=pandas_df,
+ fn="to_parquet",
+ extension="parquet",
+ )
-def test_from_csv_xz(make_csv_file):
- make_csv_file(compression="xz")
- xz_path = "{}.xz".format(TEST_CSV_FILENAME)
+class TestJson:
+ @pytest.mark.parametrize("lines", [False, True])
+ def test_read_json(self, lines):
+ unique_filename = get_unique_filename(extension="json")
+ try:
+ setup_json_file(filename=unique_filename)
+ eval_io(
+ fn_name="read_json",
+ # read_json kwargs
+ path_or_buf=unique_filename,
+ lines=lines,
+ )
+ finally:
+ teardown_test_files([unique_filename])
+
+ def test_read_json_categories(self):
+ eval_io(
+ fn_name="read_json",
+ # read_json kwargs
+ path_or_buf="modin/pandas/test/data/test_categories.json",
+ dtype={"one": "int64", "two": "category"},
+ )
- pandas_df = pandas.read_csv(xz_path)
- modin_df = pd.read_csv(xz_path)
- df_equals(modin_df, pandas_df)
+ @pytest.mark.parametrize(
+ "data",
+ [json_short_string, json_short_bytes, json_long_string, json_long_bytes],
+ )
+ def test_read_json_string_bytes(self, data):
+ with pytest.warns(UserWarning):
+ modin_df = pd.read_json(data)
+ # For I/O objects we need to rewind to reuse the same object.
+ if hasattr(data, "seek"):
+ data.seek(0)
+ df_equals(modin_df, pandas.read_json(data))
+
+ def test_to_json(self):
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
+ eval_to_file(
+ modin_obj=modin_df, pandas_obj=pandas_df, fn="to_json", extension="json"
+ )
- pandas_df = pandas.read_csv(xz_path, compression="xz")
- modin_df = pd.read_csv(xz_path, compression="xz")
- df_equals(modin_df, pandas_df)
+ @pytest.mark.parametrize("read_mode", ["r", "rb"])
+ def test_read_json_file_handle(self, request, read_mode):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip("Cannot pickle file handles. See comments in PR #2625")
+ unique_filename = get_unique_filename(extension="json")
+ try:
+ setup_json_file(filename=unique_filename)
+ with open(unique_filename, mode=read_mode) as buf:
+ df_pandas = pandas.read_json(buf)
+ buf.seek(0)
+ df_modin = pd.read_json(buf)
+ df_equals(df_pandas, df_modin)
+ finally:
+ teardown_test_files([unique_filename])
+
+
+class TestExcel:
+ @pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details")
+ @check_file_leaks
+ def test_read_excel(self):
+ unique_filename = get_unique_filename(extension="xlsx")
+ try:
+ setup_excel_file(filename=unique_filename)
+ eval_io(
+ fn_name="read_excel",
+ # read_excel kwargs
+ io=unique_filename,
+ )
+ finally:
+ teardown_test_files([unique_filename])
+ @check_file_leaks
+ def test_read_excel_engine(self):
+ unique_filename = get_unique_filename(extension="xlsx")
+ try:
+ setup_excel_file(filename=unique_filename)
+ eval_io(
+ fn_name="read_excel",
+ modin_warning=UserWarning,
+ # read_excel kwargs
+ io=unique_filename,
+ engine="openpyxl",
+ )
+ finally:
+ teardown_test_files([unique_filename])
-def test_from_csv_zip(make_csv_file):
- make_csv_file(compression="zip")
- zip_path = "{}.zip".format(TEST_CSV_FILENAME)
+ @check_file_leaks
+ def test_read_excel_index_col(self):
+ unique_filename = get_unique_filename(extension="xlsx")
+ try:
+ setup_excel_file(filename=unique_filename)
+
+ eval_io(
+ fn_name="read_excel",
+ modin_warning=UserWarning,
+ # read_excel kwargs
+ io=unique_filename,
+ index_col=0,
+ )
+ finally:
+ teardown_test_files([unique_filename])
- pandas_df = pandas.read_csv(zip_path)
- modin_df = pd.read_csv(zip_path)
- df_equals(modin_df, pandas_df)
+ @check_file_leaks
+ def test_read_excel_all_sheets(self):
+ unique_filename = get_unique_filename(extension="xlsx")
+ try:
+ setup_excel_file(filename=unique_filename)
- pandas_df = pandas.read_csv(zip_path, compression="zip")
- modin_df = pd.read_csv(zip_path, compression="zip")
- df_equals(modin_df, pandas_df)
+ pandas_df = pandas.read_excel(unique_filename, sheet_name=None)
+ modin_df = pd.read_excel(unique_filename, sheet_name=None)
+ assert isinstance(pandas_df, (OrderedDict, dict))
+ assert isinstance(modin_df, type(pandas_df))
-def test_parse_dates_read_csv():
- pandas_df = pandas.read_csv("modin/pandas/test/data/test_time_parsing.csv")
- modin_df = pd.read_csv("modin/pandas/test/data/test_time_parsing.csv")
- df_equals(modin_df, pandas_df)
+ assert pandas_df.keys() == modin_df.keys()
- pandas_df = pandas.read_csv(
- "modin/pandas/test/data/test_time_parsing.csv",
- names=[
- "timestamp",
- "symbol",
- "high",
- "low",
- "open",
- "close",
- "spread",
- "volume",
- ],
- header=0,
- index_col=0,
- encoding="utf-8",
- )
- modin_df = pd.read_csv(
- "modin/pandas/test/data/test_time_parsing.csv",
- names=[
- "timestamp",
- "symbol",
- "high",
- "low",
- "open",
- "close",
- "spread",
- "volume",
- ],
- header=0,
- index_col=0,
- encoding="utf-8",
- )
- df_equals(modin_df, pandas_df)
+ for key in pandas_df.keys():
+ df_equals(modin_df.get(key), pandas_df.get(key))
+ finally:
+ teardown_test_files([unique_filename])
- pandas_df = pandas.read_csv(
- "modin/pandas/test/data/test_time_parsing.csv",
- names=[
- "timestamp",
- "symbol",
- "high",
- "low",
- "open",
- "close",
- "spread",
- "volume",
- ],
- header=0,
- index_col=0,
- parse_dates=["timestamp"],
- encoding="utf-8",
- )
- modin_df = pd.read_csv(
- "modin/pandas/test/data/test_time_parsing.csv",
- names=[
- "timestamp",
- "symbol",
- "high",
- "low",
- "open",
- "close",
- "spread",
- "volume",
- ],
- header=0,
- index_col=0,
- parse_dates=["timestamp"],
- encoding="utf-8",
+ @pytest.mark.xfail(
+ reason="pandas throws the exception. See pandas issue #39250 for more info"
)
- df_equals(modin_df, pandas_df)
+ @check_file_leaks
+ def test_read_excel_sheetname_title(self):
+ eval_io(
+ fn_name="read_excel",
+ # read_excel kwargs
+ io="modin/pandas/test/data/excel_sheetname_title.xlsx",
+ )
- pandas_df = pandas.read_csv(
- "modin/pandas/test/data/test_time_parsing.csv",
- names=[
- "timestamp",
- "symbol",
- "high",
- "low",
- "open",
- "close",
- "spread",
- "volume",
- ],
- header=0,
- index_col=2,
- parse_dates=["timestamp"],
- encoding="utf-8",
- )
- modin_df = pd.read_csv(
- "modin/pandas/test/data/test_time_parsing.csv",
- names=[
- "timestamp",
- "symbol",
- "high",
- "low",
- "open",
- "close",
- "spread",
- "volume",
+ @check_file_leaks
+ def test_excel_empty_line(self):
+ path = "modin/pandas/test/data/test_emptyline.xlsx"
+ modin_df = pd.read_excel(path)
+ assert str(modin_df)
+
+ @pytest.mark.parametrize(
+ "sheet_name",
+ [
+ "Sheet1",
+ "AnotherSpecialName",
+ "SpecialName",
+ "SecondSpecialName",
+ 0,
+ 1,
+ 2,
+ 3,
],
- header=0,
- index_col=2,
- parse_dates=["timestamp"],
- encoding="utf-8",
)
- df_equals(modin_df, pandas_df)
-
-
-@pytest.mark.parametrize(
- "kwargs",
- [
- {"header": None, "usecols": [0, 7]},
- {"usecols": [0, 7]},
- {"names": [0, 7], "usecols": [0, 7]},
- ],
-)
-def test_from_csv_with_args(kwargs):
- file_name = "modin/pandas/test/data/issue_621.csv"
- pandas_df = pandas.read_csv(file_name, **kwargs)
- modin_df = pd.read_csv(file_name, **kwargs)
- df_equals(modin_df, pandas_df)
-
-
-def test_from_table(make_csv_file):
- make_csv_file(delimiter="\t")
-
- pandas_df = pandas.read_table(TEST_CSV_FILENAME)
- modin_df = pd.read_table(TEST_CSV_FILENAME)
-
- df_equals(modin_df, pandas_df)
-
- pandas_df = pandas.read_table(Path(TEST_CSV_FILENAME))
- modin_df = pd.read_table(Path(TEST_CSV_FILENAME))
+ @check_file_leaks
+ def test_read_excel_sheet_name(self, sheet_name):
+ eval_io(
+ fn_name="read_excel",
+ # read_excel kwargs
+ io="modin/pandas/test/data/modin_error_book.xlsx",
+ sheet_name=sheet_name,
+ )
- df_equals(modin_df, pandas_df)
+ def test_ExcelFile(self):
+ unique_filename = get_unique_filename(extension="xlsx")
+ try:
+ setup_excel_file(filename=unique_filename)
+ modin_excel_file = pd.ExcelFile(unique_filename)
+ pandas_excel_file = pandas.ExcelFile(unique_filename)
-@pytest.mark.parametrize("usecols", [["a"], ["a", "b", "e"], [0, 1, 4]])
-def test_from_csv_with_usecols(usecols):
- fname = "modin/pandas/test/data/test_usecols.csv"
- pandas_df = pandas.read_csv(fname, usecols=usecols)
- modin_df = pd.read_csv(fname, usecols=usecols)
- df_equals(modin_df, pandas_df)
+ df_equals(modin_excel_file.parse(), pandas_excel_file.parse())
+ assert modin_excel_file.io == unique_filename
+ assert isinstance(modin_excel_file, pd.ExcelFile)
+ modin_excel_file.close()
+ pandas_excel_file.close()
+ finally:
+ teardown_test_files([unique_filename])
-@pytest.mark.skipif(Engine.get() == "Python", reason="Using pandas implementation")
-def test_from_csv_s3(make_csv_file):
- dataset_url = "s3://noaa-ghcn-pds/csv/1788.csv"
- pandas_df = pandas.read_csv(dataset_url)
+ @pytest.mark.xfail(strict=False, reason="Flaky test, defaults to pandas")
+ def test_to_excel(self):
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
- # This first load is to trigger all the import deprecation warnings
- modin_df = pd.read_csv(dataset_url)
+ unique_filename_modin = get_unique_filename(extension="xlsx")
+ unique_filename_pandas = get_unique_filename(extension="xlsx")
- # This will warn if it defaults to pandas behavior, but it shouldn't
- with pytest.warns(None) as record:
- modin_df = pd.read_csv(dataset_url)
+ modin_writer = pandas.ExcelWriter(unique_filename_modin)
+ pandas_writer = pandas.ExcelWriter(unique_filename_pandas)
+ try:
+ modin_df.to_excel(modin_writer)
+ pandas_df.to_excel(pandas_writer)
- assert not any(
- "defaulting to pandas implementation" in str(err) for err in record.list
- )
+ modin_writer.save()
+ pandas_writer.save()
- df_equals(modin_df, pandas_df)
+ assert assert_files_eq(unique_filename_modin, unique_filename_pandas)
+ finally:
+ teardown_test_files([unique_filename_modin, unique_filename_pandas])
-def test_from_csv_default(make_csv_file):
- # We haven't implemented read_csv from https, but if it's implemented, then this needs to change
- dataset_url = "https://raw.githubusercontent.com/modin-project/modin/master/modin/pandas/test/data/blah.csv"
- pandas_df = pandas.read_csv(dataset_url)
+class TestHdf:
+ @pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
+ @pytest.mark.parametrize("format", [None, "table"])
+ def test_read_hdf(self, format):
+ unique_filename = get_unique_filename(extension="hdf")
+ try:
+ setup_hdf_file(filename=unique_filename, format=format)
+ eval_io(
+ fn_name="read_hdf",
+ # read_hdf kwargs
+ path_or_buf=unique_filename,
+ key="df",
+ )
+ finally:
+ teardown_test_files([unique_filename])
- with pytest.warns(UserWarning):
- modin_df = pd.read_csv(dataset_url)
+ @pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
+ def test_HDFStore(self):
+ try:
+ unique_filename_modin = get_unique_filename(extension="hdf")
+ unique_filename_pandas = get_unique_filename(extension="hdf")
+ modin_store = pd.HDFStore(unique_filename_modin)
+ pandas_store = pandas.HDFStore(unique_filename_pandas)
+
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
+
+ modin_store["foo"] = modin_df
+ pandas_store["foo"] = pandas_df
+
+ assert assert_files_eq(unique_filename_modin, unique_filename_pandas)
+ modin_df = modin_store.get("foo")
+ pandas_df = pandas_store.get("foo")
+ df_equals(modin_df, pandas_df)
+
+ assert isinstance(modin_store, pd.HDFStore)
+
+ handle, hdf_file = tempfile.mkstemp(suffix=".hdf5", prefix="test_read")
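+            # mkstemp returns an open OS-level file descriptor; close it right away
+            # since only the generated path is needed below.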
+ os.close(handle)
+ with pd.HDFStore(hdf_file, mode="w") as store:
+ store.append("data/df1", pd.DataFrame(np.random.randn(5, 5)))
+ store.append("data/df2", pd.DataFrame(np.random.randn(4, 4)))
+
+ modin_df = pd.read_hdf(hdf_file, key="data/df1", mode="r")
+ pandas_df = pandas.read_hdf(hdf_file, key="data/df1", mode="r")
+ df_equals(modin_df, pandas_df)
+ finally:
+ os.unlink(hdf_file)
+ teardown_test_files([unique_filename_modin, unique_filename_pandas])
+
+
+class TestSql:
+ def test_read_sql(self, make_sql_connection):
+ filename = get_unique_filename(extension="db")
+ table = "test_read_sql"
+ conn = make_sql_connection(filename, table)
+ query = f"select * from {table}"
+
+ eval_io(
+ fn_name="read_sql",
+ # read_sql kwargs
+ sql=query,
+ con=conn,
+ )
- df_equals(modin_df, pandas_df)
+ eval_io(
+ fn_name="read_sql",
+ # read_sql kwargs
+ sql=query,
+ con=conn,
+ index_col="index",
+ )
+ with pytest.warns(UserWarning):
+ pd.read_sql_query(query, conn)
-def test_from_csv_chunksize(make_csv_file):
- make_csv_file()
+ with pytest.warns(UserWarning):
+ pd.read_sql_table(table, conn)
- # Tests __next__ and correctness of reader as an iterator
- # Use larger chunksize to read through file quicker
- rdf_reader = pd.read_csv(TEST_CSV_FILENAME, chunksize=500)
- pd_reader = pandas.read_csv(TEST_CSV_FILENAME, chunksize=500)
+ # Test SQLAlchemy engine
+ conn = sa.create_engine(conn)
+ eval_io(
+ fn_name="read_sql",
+ # read_sql kwargs
+ sql=query,
+ con=conn,
+ )
- for modin_df, pd_df in zip(rdf_reader, pd_reader):
- df_equals(modin_df, pd_df)
+ # Test SQLAlchemy Connection
+ conn = conn.connect()
+ eval_io(
+ fn_name="read_sql",
+ # read_sql kwargs
+ sql=query,
+ con=conn,
+ )
- # Tests that get_chunk works correctly
- rdf_reader = pd.read_csv(TEST_CSV_FILENAME, chunksize=1)
- pd_reader = pandas.read_csv(TEST_CSV_FILENAME, chunksize=1)
+ def test_read_sql_with_chunksize(self, make_sql_connection):
+ filename = get_unique_filename(extension="db")
+ table = "test_read_sql_with_chunksize"
+ conn = make_sql_connection(filename, table)
+ query = f"select * from {table}"
+
+ pandas_gen = pandas.read_sql(query, conn, chunksize=10)
+ modin_gen = pd.read_sql(query, conn, chunksize=10)
+ for modin_df, pandas_df in zip(modin_gen, pandas_gen):
+ df_equals(modin_df, pandas_df)
+
+ @pytest.mark.parametrize("index", [False, True])
+ def test_to_sql(self, make_sql_connection, index):
+ table_name = f"test_to_sql_{str(index)}"
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
+
+ # We do not pass the table name so the fixture won't generate a table
+ conn = make_sql_connection(f"{table_name}_modin.db")
+ modin_df.to_sql(table_name, conn, index=index)
+ df_modin_sql = pandas.read_sql(
+ table_name, con=conn, index_col="index" if index else None
+ )
- modin_df = rdf_reader.get_chunk(1)
- pd_df = pd_reader.get_chunk(1)
+ # We do not pass the table name so the fixture won't generate a table
+ conn = make_sql_connection(f"{table_name}_pandas.db")
+ pandas_df.to_sql(table_name, conn, index=index)
+ df_pandas_sql = pandas.read_sql(
+ table_name, con=conn, index_col="index" if index else None
+ )
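+        # Row order returned by read_sql is not guaranteed, so compare after sorting
+        # both frames by index.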
- df_equals(modin_df, pd_df)
+ assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index())
- # Tests that read works correctly
- rdf_reader = pd.read_csv(TEST_CSV_FILENAME, chunksize=1)
- pd_reader = pandas.read_csv(TEST_CSV_FILENAME, chunksize=1)
- modin_df = rdf_reader.read()
- pd_df = pd_reader.read()
+class TestHtml:
+ @pytest.mark.xfail(reason="read_html is not yet implemented properly - issue #1296")
+ def test_read_html(self):
+ unique_filename = get_unique_filename(extension="html")
+ try:
+ setup_html_file(filename=unique_filename)
+ eval_io(fn_name="read_html", io=unique_filename)
+ finally:
+ teardown_test_files([unique_filename])
- df_equals(modin_df, pd_df)
+ def test_to_html(self):
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
+ eval_to_file(
+ modin_obj=modin_df, pandas_obj=pandas_df, fn="to_html", extension="html"
+ )
-@pytest.mark.parametrize("nrows", [123, None])
-def test_from_csv_skiprows(make_csv_file, nrows):
- make_csv_file()
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, skiprows=2, nrows=nrows)
- modin_df = pd.read_csv(TEST_CSV_FILENAME, skiprows=2, nrows=nrows)
- df_equals(modin_df, pandas_df)
+class TestFwf:
+ def test_fwf_file(self):
+ fwf_data = (
+ "id8141 360.242940 149.910199 11950.7\n"
+ "id1594 444.953632 166.985655 11788.4\n"
+ "id1849 364.136849 183.628767 11806.2\n"
+ "id1230 413.836124 184.375703 11916.8\n"
+ "id1948 502.953953 173.237159 12468.3\n"
+ )
- pandas_df = pandas.read_csv(
- TEST_CSV_FILENAME, names=["c1", "c2", "c3", "c4"], skiprows=2, nrows=nrows
- )
- modin_df = pd.read_csv(
- TEST_CSV_FILENAME, names=["c1", "c2", "c3", "c4"], skiprows=2, nrows=nrows
- )
- df_equals(modin_df, pandas_df)
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename, fwf_data=fwf_data)
- pandas_df = pandas.read_csv(
- TEST_CSV_FILENAME,
- names=["c1", "c2", "c3", "c4"],
- skiprows=lambda x: x % 2,
- nrows=nrows,
- )
- modin_df = pd.read_csv(
- TEST_CSV_FILENAME,
- names=["c1", "c2", "c3", "c4"],
- skiprows=lambda x: x % 2,
- nrows=nrows,
+ colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]
+ df = pd.read_fwf(
+ unique_filename, colspecs=colspecs, header=None, index_col=0
+ )
+ assert isinstance(df, pd.DataFrame)
+ finally:
+ teardown_test_files([unique_filename])
+
+ @pytest.mark.parametrize(
+ "kwargs",
+ [
+ {
+ "colspecs": [
+ (0, 11),
+ (11, 15),
+ (19, 24),
+ (27, 32),
+ (35, 40),
+ (43, 48),
+ (51, 56),
+ (59, 64),
+ (67, 72),
+ (75, 80),
+ (83, 88),
+ (91, 96),
+ (99, 104),
+ (107, 112),
+ ],
+ "names": ["stationID", "year", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+ "na_values": ["-9999"],
+ "index_col": ["stationID", "year"],
+ },
+ {
+ "widths": [20, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8],
+ "names": ["id", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+ "index_col": [0],
+ },
+ ],
)
- df_equals(modin_df, pandas_df)
-
-
-@pytest.mark.parametrize(
- "encoding", ["latin8", "ISO-8859-1", "latin1", "iso-8859-1", "cp1252", "utf8"]
-)
-def test_from_csv_encoding(make_csv_file, encoding):
- make_csv_file(encoding=encoding)
-
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, encoding=encoding)
- modin_df = pd.read_csv(TEST_CSV_FILENAME, encoding=encoding)
-
- df_equals(modin_df, pandas_df)
-
+ def test_fwf_file_colspecs_widths(self, kwargs):
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename)
+
+ modin_df = pd.read_fwf(unique_filename, **kwargs)
+            pandas_df = pandas.read_fwf(unique_filename, **kwargs)
+
+ df_equals(modin_df, pandas_df)
+ finally:
+ teardown_test_files([unique_filename])
+
+ @pytest.mark.parametrize("usecols", [["a"], ["a", "b", "d"], [0, 1, 3]])
+ def test_fwf_file_usecols(self, usecols):
+ fwf_data = (
+ "a b c d\n"
+ "id8141 360.242940 149.910199 11950.7\n"
+ "id1594 444.953632 166.985655 11788.4\n"
+ "id1849 364.136849 183.628767 11806.2\n"
+ "id1230 413.836124 184.375703 11916.8\n"
+ "id1948 502.953953 173.237159 12468.3\n"
+ )
-def test_from_csv_default_to_pandas_behavior(make_csv_file):
- make_csv_file()
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename, fwf_data=fwf_data)
- with pytest.warns(UserWarning):
- # This tests that we default to pandas on a buffer
- from io import StringIO
+ eval_io(
+ fn_name="read_fwf",
+ # read_fwf kwargs
+ filepath_or_buffer=unique_filename,
+ usecols=usecols,
+ )
+ finally:
+ teardown_test_files([unique_filename])
- pd.read_csv(StringIO(open(TEST_CSV_FILENAME, "r").read()))
+ def test_fwf_file_chunksize(self):
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename)
- with pytest.warns(UserWarning):
- pd.read_csv(TEST_CSV_FILENAME, skiprows=lambda x: x in [0, 2])
+ # Tests __next__ and correctness of reader as an iterator
+ rdf_reader = pd.read_fwf(unique_filename, chunksize=5)
+ pd_reader = pandas.read_fwf(unique_filename, chunksize=5)
+ for modin_df, pd_df in zip(rdf_reader, pd_reader):
+ df_equals(modin_df, pd_df)
-@pytest.mark.parametrize("nrows", [123, None])
-def test_from_csv_index_col(make_csv_file, nrows):
- make_csv_file()
+ # Tests that get_chunk works correctly
+ rdf_reader = pd.read_fwf(unique_filename, chunksize=1)
+ pd_reader = pandas.read_fwf(unique_filename, chunksize=1)
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, index_col="col1", nrows=nrows)
- modin_df = pd.read_csv(TEST_CSV_FILENAME, index_col="col1", nrows=nrows)
- df_equals(modin_df, pandas_df)
+ modin_df = rdf_reader.get_chunk(1)
+ pd_df = pd_reader.get_chunk(1)
+ df_equals(modin_df, pd_df)
-def test_from_csv_skipfooter(make_csv_file):
- make_csv_file()
+ # Tests that read works correctly
+ rdf_reader = pd.read_fwf(unique_filename, chunksize=1)
+ pd_reader = pandas.read_fwf(unique_filename, chunksize=1)
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, skipfooter=13)
- modin_df = pd.read_csv(TEST_CSV_FILENAME, skipfooter=13)
+ modin_df = rdf_reader.read()
+ pd_df = pd_reader.read()
- df_equals(modin_df, pandas_df)
+ df_equals(modin_df, pd_df)
+ finally:
+ teardown_test_files([unique_filename])
+ @pytest.mark.parametrize("nrows", [13, None])
+ def test_fwf_file_skiprows(self, nrows):
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename)
+
+ eval_io(
+ fn_name="read_fwf",
+ # read_fwf kwargs
+ filepath_or_buffer=unique_filename,
+ skiprows=2,
+ nrows=nrows,
+ )
-def test_from_csv_parse_dates(make_csv_file):
- make_csv_file(force=True)
+ eval_io(
+ fn_name="read_fwf",
+ # read_fwf kwargs
+ filepath_or_buffer=unique_filename,
+ usecols=[0, 4, 7],
+ skiprows=[2, 5],
+ nrows=nrows,
+ )
+ finally:
+ teardown_test_files([unique_filename])
+
+ def test_fwf_file_index_col(self):
+ fwf_data = (
+ "a b c d\n"
+ "id8141 360.242940 149.910199 11950.7\n"
+ "id1594 444.953632 166.985655 11788.4\n"
+ "id1849 364.136849 183.628767 11806.2\n"
+ "id1230 413.836124 184.375703 11916.8\n"
+ "id1948 502.953953 173.237159 12468.3\n"
+ )
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME, parse_dates=[["col2", "col4"]])
- modin_df = pd.read_csv(TEST_CSV_FILENAME, parse_dates=[["col2", "col4"]])
- df_equals(modin_df, pandas_df)
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename, fwf_data=fwf_data)
+ eval_io(
+ fn_name="read_fwf",
+ # read_fwf kwargs
+ filepath_or_buffer=unique_filename,
+ index_col="c",
+ )
+ finally:
+ teardown_test_files([unique_filename])
- pandas_df = pandas.read_csv(
- TEST_CSV_FILENAME, parse_dates={"time": ["col2", "col4"]}
- )
- modin_df = pd.read_csv(TEST_CSV_FILENAME, parse_dates={"time": ["col2", "col4"]})
- df_equals(modin_df, pandas_df)
+ def test_fwf_file_skipfooter(self):
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename)
+ eval_io(
+ fn_name="read_fwf",
+ # read_fwf kwargs
+ filepath_or_buffer=unique_filename,
+ skipfooter=2,
+ )
+ finally:
+ teardown_test_files([unique_filename])
+
+ def test_fwf_file_parse_dates(self):
+ dates = pandas.date_range("2000", freq="h", periods=10)
+ fwf_data = "col1 col2 col3 col4"
+ for i in range(10, 20):
+ fwf_data = fwf_data + "\n{col1} {col2} {col3} {col4}".format(
+ col1=str(i),
+ col2=str(dates[i - 10].date()),
+ col3=str(i),
+ col4=str(dates[i - 10].time()),
+ )
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename, fwf_data=fwf_data)
-@pytest.mark.parametrize("nrows", [21, 5, None])
-@pytest.mark.parametrize("skiprows", [4, 1, 500, None])
-def test_from_csv_newlines_in_quotes(nrows, skiprows):
- eval_io(
- path="modin/pandas/test/data/newlines.csv",
- fn_name="read_csv",
- nrows=nrows,
- skiprows=skiprows,
- cast_to_str=True,
- )
+ eval_io(
+ fn_name="read_fwf",
+ # read_fwf kwargs
+ filepath_or_buffer=unique_filename,
+ parse_dates=[["col2", "col4"]],
+ )
+ eval_io(
+ fn_name="read_fwf",
+ # read_fwf kwargs
+ filepath_or_buffer=unique_filename,
+ parse_dates={"time": ["col2", "col4"]},
+ )
+ finally:
+ teardown_test_files([unique_filename])
+
+ @pytest.mark.parametrize("read_mode", ["r", "rb"])
+ def test_read_fwf_file_handle(self, request, read_mode):
+ if request.config.getoption("--simulate-cloud").lower() != "off":
+ pytest.skip("Cannot pickle file handles. See comments in PR #2625")
+ unique_filename = get_unique_filename(extension="txt")
+ try:
+ setup_fwf_file(filename=unique_filename)
+
+ with open(unique_filename, mode=read_mode) as buffer:
+ df_pandas = pandas.read_fwf(buffer)
+ buffer.seek(0)
+ df_modin = pd.read_fwf(buffer)
+ df_equals(df_modin, df_pandas)
+ finally:
+ teardown_test_files([unique_filename])
+
+
+class TestGbq:
+ @pytest.mark.skip(reason="Need to verify GBQ access")
+ def test_read_gbq(self):
+ # Test API, but do not supply credentials until credits can be secured.
+ with pytest.raises(
+ ValueError, match="Could not determine project ID and one was not supplied."
+ ):
+ pd.read_gbq("SELECT 1")
+
+ @pytest.mark.skip(reason="Need to verify GBQ access")
+ def test_to_gbq(self):
+ modin_df, _ = create_test_dfs(TEST_DATA)
+ # Test API, but do not supply credentials until credits can be secured.
+ with pytest.raises(
+ ValueError, match="Could not determine project ID and one was not supplied."
+ ):
+ modin_df.to_gbq("modin.table")
+
+
+class TestStata:
+ def test_read_stata(self):
+ unique_filename = get_unique_filename(extension="stata")
+ try:
+ setup_stata_file(filename=unique_filename)
+ eval_io(
+ fn_name="read_stata",
+ # read_stata kwargs
+ filepath_or_buffer=unique_filename,
+ )
+ finally:
+ teardown_test_files([unique_filename])
-def test_read_csv_incorrect_data():
- name = "modin/pandas/test/data/test_categories.json"
- pandas_df, modin_df = pandas.read_csv(name), pd.read_csv(name)
+ def test_to_stata(self):
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
+ eval_to_file(
+ modin_obj=modin_df, pandas_obj=pandas_df, fn="to_stata", extension="stata"
+ )
- df_equals(pandas_df, modin_df)
+class TestFeather:
+ def test_read_feather(self):
+ unique_filename = get_unique_filename(extension="feather")
+ try:
+ setup_feather_file(filename=unique_filename)
-@pytest.mark.skip(reason="No clipboard on Travis")
-def test_to_clipboard():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
+ eval_io(
+ fn_name="read_feather",
+ # read_feather kwargs
+ path=unique_filename,
+ )
+ finally:
+ teardown_test_files([unique_filename])
+
+ def test_to_feather(self):
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
+ eval_to_file(
+ modin_obj=modin_df,
+ pandas_obj=pandas_df,
+ fn="to_feather",
+ extension="feather",
+ )
- modin_df.to_clipboard()
- modin_as_clip = pandas.read_clipboard()
- pandas_df.to_clipboard()
- pandas_as_clip = pandas.read_clipboard()
+class TestClipboard:
+ @pytest.mark.skip(reason="No clipboard on Travis")
+ def test_read_clipboard(self):
+ setup_clipboard()
- assert modin_as_clip.equals(pandas_as_clip)
+ eval_io(fn_name="read_clipboard")
+ @pytest.mark.skip(reason="No clipboard on Travis")
+ def test_to_clipboard(self):
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
-def test_dataframe_to_csv():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
+ modin_df.to_clipboard()
+ modin_as_clip = pandas.read_clipboard()
- TEST_CSV_DF_FILENAME = "test_df.csv"
- TEST_CSV_pandas_FILENAME = "test_pandas.csv"
+ pandas_df.to_clipboard()
+ pandas_as_clip = pandas.read_clipboard()
- modin_df.to_csv(TEST_CSV_DF_FILENAME)
- pandas_df.to_csv(TEST_CSV_pandas_FILENAME)
+ assert modin_as_clip.equals(pandas_as_clip)
- assert assert_files_eq(TEST_CSV_DF_FILENAME, TEST_CSV_pandas_FILENAME)
- teardown_test_file(TEST_CSV_pandas_FILENAME)
- teardown_test_file(TEST_CSV_DF_FILENAME)
+class TestPickle:
+ def test_read_pickle(self):
+ unique_filename = get_unique_filename(extension="pkl")
+ try:
+ setup_pickle_file(filename=unique_filename)
+ eval_io(
+ fn_name="read_pickle",
+ # read_pickle kwargs
+ filepath_or_buffer=unique_filename,
+ )
+ finally:
+ teardown_test_files([unique_filename])
-def test_series_to_csv():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
+ def test_to_pickle(self):
+ modin_df, pandas_df = create_test_dfs(TEST_DATA)
+ eval_to_file(
+ modin_obj=modin_df, pandas_obj=pandas_df, fn="to_pickle", extension="pkl"
+ )
- TEST_CSV_DF_FILENAME = "test_df.csv"
- TEST_CSV_pandas_FILENAME = "test_pandas.csv"
+ unique_filename_modin = get_unique_filename(extension="pkl")
+ unique_filename_pandas = get_unique_filename(extension="pkl")
+ try:
+ pd.to_pickle(modin_df, unique_filename_modin)
+ pandas.to_pickle(pandas_df, unique_filename_pandas)
- modin_s = modin_df["col1"]
- pandas_s = pandas_df["col1"]
- modin_s.to_csv(TEST_CSV_DF_FILENAME)
- pandas_s.to_csv(TEST_CSV_pandas_FILENAME)
+ assert assert_files_eq(unique_filename_modin, unique_filename_pandas)
+ finally:
+ teardown_test_files([unique_filename_modin, unique_filename_pandas])
- df_equals(modin_s, pandas_s)
- assert modin_s.name == pandas_s.name
- assert assert_files_eq(TEST_CSV_DF_FILENAME, TEST_CSV_pandas_FILENAME)
- teardown_test_file(TEST_CSV_pandas_FILENAME)
- teardown_test_file(TEST_CSV_DF_FILENAME)
+def test_from_arrow():
+ _, pandas_df = create_test_dfs(TEST_DATA)
+ modin_df = from_arrow(pa.Table.from_pandas(pandas_df))
+ df_equals(modin_df, pandas_df)
-@pytest.mark.skip(reason="Defaulting to Pandas")
def test_to_dense():
- modin_df = create_test_modin_dataframe()
-
- with pytest.raises(NotImplementedError):
- modin_df.to_dense()
+ modin_df, pandas_df = create_test_dfs({"col1": pandas.SparseArray([0, 1, 0])})
+ df_equals(modin_df.sparse.to_dense(), pandas_df.sparse.to_dense())
def test_to_dict():
- modin_df = create_test_modin_dataframe()
+ modin_df, _ = create_test_dfs(TEST_DATA)
assert modin_df.to_dict() == to_pandas(modin_df).to_dict()
-@pytest.mark.xfail(strict=False, reason="Flaky test, defaults to pandas")
-def test_to_excel():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_EXCEL_DF_FILENAME = "test_df.xlsx"
- TEST_EXCEL_pandas_FILENAME = "test_pandas.xlsx"
-
- modin_writer = pandas.ExcelWriter(TEST_EXCEL_DF_FILENAME)
- pandas_writer = pandas.ExcelWriter(TEST_EXCEL_pandas_FILENAME)
-
- modin_df.to_excel(modin_writer)
- pandas_df.to_excel(pandas_writer)
-
- modin_writer.save()
- pandas_writer.save()
-
- assert assert_files_eq(TEST_EXCEL_DF_FILENAME, TEST_EXCEL_pandas_FILENAME)
-
- teardown_test_file(TEST_EXCEL_DF_FILENAME)
- teardown_test_file(TEST_EXCEL_pandas_FILENAME)
-
-
-def test_to_feather():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_FEATHER_DF_FILENAME = "test_df.feather"
- TEST_FEATHER_pandas_FILENAME = "test_pandas.feather"
-
- modin_df.to_feather(TEST_FEATHER_DF_FILENAME)
- pandas_df.to_feather(TEST_FEATHER_pandas_FILENAME)
-
- assert assert_files_eq(TEST_FEATHER_DF_FILENAME, TEST_FEATHER_pandas_FILENAME)
-
- teardown_test_file(TEST_FEATHER_pandas_FILENAME)
- teardown_test_file(TEST_FEATHER_DF_FILENAME)
-
-
-def test_to_html():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_HTML_DF_FILENAME = "test_df.html"
- TEST_HTML_pandas_FILENAME = "test_pandas.html"
-
- modin_df.to_html(TEST_HTML_DF_FILENAME)
- pandas_df.to_html(TEST_HTML_pandas_FILENAME)
-
- assert assert_files_eq(TEST_HTML_DF_FILENAME, TEST_HTML_pandas_FILENAME)
-
- teardown_test_file(TEST_HTML_pandas_FILENAME)
- teardown_test_file(TEST_HTML_DF_FILENAME)
-
-
-def test_to_json():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_JSON_DF_FILENAME = "test_df.json"
- TEST_JSON_pandas_FILENAME = "test_pandas.json"
-
- modin_df.to_json(TEST_JSON_DF_FILENAME)
- pandas_df.to_json(TEST_JSON_pandas_FILENAME)
-
- assert assert_files_eq(TEST_JSON_DF_FILENAME, TEST_JSON_pandas_FILENAME)
-
- teardown_test_file(TEST_JSON_pandas_FILENAME)
- teardown_test_file(TEST_JSON_DF_FILENAME)
-
-
def test_to_latex():
- modin_df = create_test_modin_dataframe()
+ modin_df, _ = create_test_dfs(TEST_DATA)
assert modin_df.to_latex() == to_pandas(modin_df).to_latex()
-def test_to_parquet():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_PARQUET_DF_FILENAME = "test_df.parquet"
- TEST_PARQUET_pandas_FILENAME = "test_pandas.parquet"
-
- modin_df.to_parquet(TEST_PARQUET_DF_FILENAME)
- pandas_df.to_parquet(TEST_PARQUET_pandas_FILENAME)
-
- assert assert_files_eq(TEST_PARQUET_DF_FILENAME, TEST_PARQUET_pandas_FILENAME)
-
- teardown_test_file(TEST_PARQUET_pandas_FILENAME)
- teardown_test_file(TEST_PARQUET_DF_FILENAME)
-
-
-@pytest.mark.skip(reason="Defaulting to Pandas")
def test_to_period():
- modin_df = create_test_modin_dataframe()
-
- with pytest.raises(NotImplementedError):
- modin_df.to_period()
-
-
-def test_to_pickle():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_PICKLE_DF_FILENAME = "test_df.pkl"
- TEST_PICKLE_pandas_FILENAME = "test_pandas.pkl"
-
- modin_df.to_pickle(TEST_PICKLE_DF_FILENAME)
- pandas_df.to_pickle(TEST_PICKLE_pandas_FILENAME)
-
- assert assert_files_eq(TEST_PICKLE_DF_FILENAME, TEST_PICKLE_pandas_FILENAME)
-
- teardown_test_file(TEST_PICKLE_pandas_FILENAME)
- teardown_test_file(TEST_PICKLE_DF_FILENAME)
-
- pd.to_pickle(modin_df, TEST_PICKLE_DF_FILENAME)
- pandas.to_pickle(pandas_df, TEST_PICKLE_pandas_FILENAME)
-
- assert assert_files_eq(TEST_PICKLE_DF_FILENAME, TEST_PICKLE_pandas_FILENAME)
-
- teardown_test_file(TEST_PICKLE_pandas_FILENAME)
- teardown_test_file(TEST_PICKLE_DF_FILENAME)
-
-
-def test_to_sql_without_index(make_sql_connection):
- table_name = "tbl_without_index"
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- # We do not pass the table name so the fixture won't generate a table
- conn = make_sql_connection("test_to_sql.db")
- modin_df.to_sql(table_name, conn, index=False)
- df_modin_sql = pandas.read_sql(table_name, con=conn)
-
- # We do not pass the table name so the fixture won't generate a table
- conn = make_sql_connection("test_to_sql_pandas.db")
- pandas_df.to_sql(table_name, conn, index=False)
- df_pandas_sql = pandas.read_sql(table_name, con=conn)
-
- assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index())
-
-
-def test_to_sql_with_index(make_sql_connection):
- table_name = "tbl_with_index"
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- # We do not pass the table name so the fixture won't generate a table
- conn = make_sql_connection("test_to_sql.db")
- modin_df.to_sql(table_name, conn)
- df_modin_sql = pandas.read_sql(table_name, con=conn, index_col="index")
-
- # We do not pass the table name so the fixture won't generate a table
- conn = make_sql_connection("test_to_sql_pandas.db")
- pandas_df.to_sql(table_name, conn)
- df_pandas_sql = pandas.read_sql(table_name, con=conn, index_col="index")
-
- assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index())
-
-
-def test_to_stata():
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_STATA_DF_FILENAME = "test_df.stata"
- TEST_STATA_pandas_FILENAME = "test_pandas.stata"
-
- modin_df.to_stata(TEST_STATA_DF_FILENAME)
- pandas_df.to_stata(TEST_STATA_pandas_FILENAME)
-
- assert assert_files_eq(TEST_STATA_DF_FILENAME, TEST_STATA_pandas_FILENAME)
-
- teardown_test_file(TEST_STATA_pandas_FILENAME)
- teardown_test_file(TEST_STATA_DF_FILENAME)
-
-
-@pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
-def test_HDFStore():
- modin_store = pd.HDFStore(TEST_WRITE_HDF_FILENAME_MODIN)
- pandas_store = pandas.HDFStore(TEST_WRITE_HDF_FILENAME_PANDAS)
-
- modin_df = create_test_modin_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- modin_store["foo"] = modin_df
- pandas_store["foo"] = pandas_df
-
- assert assert_files_eq(
- TEST_WRITE_HDF_FILENAME_MODIN, TEST_WRITE_HDF_FILENAME_PANDAS
+ index = pandas.DatetimeIndex(
+ pandas.date_range("2000", freq="h", periods=len(TEST_DATA["col1"]))
)
- modin_df = modin_store.get("foo")
- pandas_df = pandas_store.get("foo")
- df_equals(modin_df, pandas_df)
-
- assert isinstance(modin_store, pd.HDFStore)
-
- hdf_file = "/tmp/test_read_hdf.hdf5"
- with pd.HDFStore(hdf_file, mode="w") as store:
- store.append("data/df1", pd.DataFrame(np.random.randn(5, 5)))
- store.append("data/df2", pd.DataFrame(np.random.randn(4, 4)))
-
- modin_df = pd.read_hdf(hdf_file, key="data/df1", mode="r")
- pandas_df = pandas.read_hdf(hdf_file, key="data/df1", mode="r")
- df_equals(modin_df, pandas_df)
-
-
-def test_ExcelFile():
- setup_excel_file(SMALL_ROW_SIZE)
-
- modin_excel_file = pd.ExcelFile(TEST_EXCEL_FILENAME)
- pandas_excel_file = pandas.ExcelFile(TEST_EXCEL_FILENAME)
-
- df_equals(modin_excel_file.parse(), pandas_excel_file.parse())
-
- assert modin_excel_file.io == TEST_EXCEL_FILENAME
- assert isinstance(modin_excel_file, pd.ExcelFile)
- modin_excel_file.close()
- pandas_excel_file.close()
-
- teardown_excel_file()
-
-
-def test_fwf_file():
- fwf_data = """id8141 360.242940 149.910199 11950.7
-id1594 444.953632 166.985655 11788.4
-id1849 364.136849 183.628767 11806.2
-id1230 413.836124 184.375703 11916.8
-id1948 502.953953 173.237159 12468.3"""
-
- setup_fwf_file(True, fwf_data=fwf_data)
-
- colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]
- df = pd.read_fwf(TEST_FWF_FILENAME, colspecs=colspecs, header=None, index_col=0)
- assert isinstance(df, pd.DataFrame)
-
- teardown_fwf_file()
-
-
-@pytest.mark.parametrize(
- "kwargs",
- [
- {
- "colspecs": [
- (0, 11),
- (11, 15),
- (19, 24),
- (27, 32),
- (35, 40),
- (43, 48),
- (51, 56),
- (59, 64),
- (67, 72),
- (75, 80),
- (83, 88),
- (91, 96),
- (99, 104),
- (107, 112),
- ],
- "names": ["stationID", "year", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
- "na_values": ["-9999"],
- "index_col": ["stationID", "year"],
- },
- {
- "widths": [20, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8],
- "names": ["id", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
- "index_col": [0],
- },
- ],
-)
-def test_fwf_file_colspecs_widths(kwargs):
- setup_fwf_file(overwrite=True)
-
- modin_df = pd.read_fwf(TEST_FWF_FILENAME, **kwargs)
- pandas_df = pd.read_fwf(TEST_FWF_FILENAME, **kwargs)
-
- df_equals(modin_df, pandas_df)
-
-
-@pytest.mark.parametrize("usecols", [["a"], ["a", "b", "d"], [0, 1, 3]])
-def test_fwf_file_usecols(usecols):
- fwf_data = """a b c d
-id8141 360.242940 149.910199 11950.7
-id1594 444.953632 166.985655 11788.4
-id1849 364.136849 183.628767 11806.2
-id1230 413.836124 184.375703 11916.8
-id1948 502.953953 173.237159 12468.3"""
-
- setup_fwf_file(overwrite=True, fwf_data=fwf_data)
-
- pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, usecols=usecols)
- modin_df = pd.read_fwf(TEST_FWF_FILENAME, usecols=usecols)
-
- df_equals(modin_df, pandas_df)
-
- teardown_fwf_file()
-
-
-def test_fwf_file_chunksize():
- setup_fwf_file(overwrite=True)
-
- # Tests __next__ and correctness of reader as an iterator
- rdf_reader = pd.read_fwf(TEST_FWF_FILENAME, chunksize=5)
- pd_reader = pandas.read_fwf(TEST_FWF_FILENAME, chunksize=5)
-
- for modin_df, pd_df in zip(rdf_reader, pd_reader):
- df_equals(modin_df, pd_df)
-
- # Tests that get_chunk works correctly
- rdf_reader = pd.read_fwf(TEST_FWF_FILENAME, chunksize=1)
- pd_reader = pandas.read_fwf(TEST_FWF_FILENAME, chunksize=1)
-
- modin_df = rdf_reader.get_chunk(1)
- pd_df = pd_reader.get_chunk(1)
-
- df_equals(modin_df, pd_df)
-
- # Tests that read works correctly
- rdf_reader = pd.read_fwf(TEST_FWF_FILENAME, chunksize=1)
- pd_reader = pandas.read_fwf(TEST_FWF_FILENAME, chunksize=1)
-
- modin_df = rdf_reader.read()
- pd_df = pd_reader.read()
-
- df_equals(modin_df, pd_df)
-
-
-@pytest.mark.parametrize("nrows", [13, None])
-def test_fwf_file_skiprows(nrows):
- setup_fwf_file(overwrite=True)
-
- pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, skiprows=2, nrows=nrows)
- modin_df = pd.read_fwf(TEST_FWF_FILENAME, skiprows=2, nrows=nrows)
- df_equals(modin_df, pandas_df)
-
- pandas_df = pandas.read_fwf(
- TEST_FWF_FILENAME, usecols=[0, 4, 7], skiprows=[2, 5], nrows=nrows
- )
- modin_df = pd.read_fwf(
- TEST_FWF_FILENAME, usecols=[0, 4, 7], skiprows=[2, 5], nrows=nrows
- )
- df_equals(modin_df, pandas_df)
-
-
-def test_fwf_file_index_col():
- fwf_data = """a b c d
-id8141 360.242940 149.910199 11950.7
-id1594 444.953632 166.985655 11788.4
-id1849 364.136849 183.628767 11806.2
-id1230 413.836124 184.375703 11916.8
-id1948 502.953953 173.237159 12468.3"""
-
- setup_fwf_file(overwrite=True, fwf_data=fwf_data)
-
- pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, index_col="c")
- modin_df = pd.read_fwf(TEST_FWF_FILENAME, index_col="c")
- df_equals(modin_df, pandas_df)
-
- teardown_fwf_file()
-
-
-def test_fwf_file_skipfooter():
- setup_fwf_file(overwrite=True)
-
- pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, skipfooter=2)
- modin_df = pd.read_fwf(TEST_FWF_FILENAME, skipfooter=2)
-
- df_equals(modin_df, pandas_df)
-
-
-def test_fwf_file_parse_dates():
- dates = pandas.date_range("2000", freq="h", periods=10)
- fwf_data = "col1 col2 col3 col4"
- for i in range(10, 20):
- fwf_data = fwf_data + "\n{col1} {col2} {col3} {col4}".format(
- col1=str(i),
- col2=str(dates[i - 10].date()),
- col3=str(i),
- col4=str(dates[i - 10].time()),
- )
-
- setup_fwf_file(overwrite=True, fwf_data=fwf_data)
-
- pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, parse_dates=[["col2", "col4"]])
- modin_df = pd.read_fwf(TEST_FWF_FILENAME, parse_dates=[["col2", "col4"]])
- df_equals(modin_df, pandas_df)
-
- pandas_df = pandas.read_fwf(
- TEST_FWF_FILENAME, parse_dates={"time": ["col2", "col4"]}
- )
- modin_df = pd.read_fwf(TEST_FWF_FILENAME, parse_dates={"time": ["col2", "col4"]})
- df_equals(modin_df, pandas_df)
-
- teardown_fwf_file()
-
-
-@pytest.mark.skip(reason="Need to verify GBQ access")
-def test_from_gbq():
- # Test API, but do not supply credentials until credits can be secured.
- with pytest.raises(
- ValueError, match="Could not determine project ID and one was not supplied."
- ):
- pd.read_gbq("SELECT 1")
-
-
-@pytest.mark.skip(reason="Need to verify GBQ access")
-def test_to_gbq():
- modin_df = create_test_modin_dataframe()
- # Test API, but do not supply credentials until credits can be secured.
- with pytest.raises(
- ValueError, match="Could not determine project ID and one was not supplied."
- ):
- modin_df.to_gbq("modin.table")
-
-
-def test_cleanup():
- filenames = [
- TEST_PARQUET_FILENAME,
- TEST_CSV_FILENAME,
- TEST_JSON_FILENAME,
- TEST_HTML_FILENAME,
- TEST_EXCEL_FILENAME,
- TEST_FEATHER_FILENAME,
- TEST_READ_HDF_FILENAME,
- TEST_WRITE_HDF_FILENAME_MODIN,
- TEST_WRITE_HDF_FILENAME_PANDAS,
- TEST_STATA_FILENAME,
- TEST_PICKLE_FILENAME,
- TEST_SAS_FILENAME,
- TEST_FWF_FILENAME,
- TEST_GBQ_FILENAME,
- ]
- for f in filenames:
- if os.path.exists(f):
- # Need try..except for Windows
- try:
- os.remove(f)
- except PermissionError:
- pass
-
-
-def test_from_arrow():
- pandas_df = create_test_pandas_dataframe()
- modin_df = from_arrow(pa.Table.from_pandas(pandas_df))
- df_equals(modin_df, pandas_df)
-
-
-@pytest.mark.parametrize(
- "kwargs",
- [
- {"names": [5, 1, 3, 4, 2, 6]},
- {"names": [0]},
- {"names": None, "usecols": [1, 0, 2]},
- {"names": [3, 1, 2, 5], "usecols": [4, 1, 3, 2]},
- ],
-)
-def test_csv_names_neq_num_cols(kwargs):
- file_name = "modin/pandas/test/data/issue_2074.csv"
- pandas_df = pandas.read_csv(file_name, **kwargs)
- modin_df = pd.read_csv(file_name, **kwargs)
- df_equals(modin_df, pandas_df)
+ modin_df, pandas_df = create_test_dfs(TEST_DATA, index=index)
+ df_equals(modin_df.to_period(), pandas_df.to_period())
diff --git a/modin/pandas/test/test_rolling.py b/modin/pandas/test/test_rolling.py
index 2dd935f8993..f15128afe3e 100644
--- a/modin/pandas/test/test_rolling.py
+++ b/modin/pandas/test/test_rolling.py
@@ -17,8 +17,9 @@
import modin.pandas as pd
from .utils import df_equals, test_data_values, test_data_keys
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
def create_test_series(vals):
@@ -60,7 +61,6 @@ def test_dataframe(data, window, min_periods, win_type):
df_equals(modin_rolled.std(ddof=0), pandas_rolled.std(ddof=0))
# Testing of Rolling class
else:
- df_equals(modin_rolled.count(), pandas_rolled.count())
df_equals(modin_rolled.sum(), pandas_rolled.sum())
df_equals(modin_rolled.mean(), pandas_rolled.mean())
df_equals(modin_rolled.median(), pandas_rolled.median())
@@ -77,6 +77,11 @@ def test_dataframe(data, window, min_periods, win_type):
pandas_rolled.aggregate([np.sum, np.mean]),
)
df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))
+        # `Rolling.count` has a buggy side effect on other rolling functions, described in:
+        # https://github.com/pandas-dev/pandas/issues/39554
+        # So `.count` should always be tested last, until this bug is fixed
+        # in pandas, to avoid triggering that side effect
+ df_equals(modin_rolled.count(), pandas_rolled.count())
@pytest.mark.parametrize("axis", [0, "columns"])
@@ -120,7 +125,6 @@ def test_dataframe_dt_index(axis, on, closed, window):
pandas_rolled.corr(pandas_df[pandas_df.columns[0]], True),
)
else:
- df_equals(modin_rolled.count(), pandas_rolled.count())
df_equals(modin_rolled.skew(), pandas_rolled.skew())
df_equals(
modin_rolled.apply(np.sum, raw=True),
@@ -128,6 +132,11 @@ def test_dataframe_dt_index(axis, on, closed, window):
)
df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum))
df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))
+        # `Rolling.count` has a buggy side effect on other rolling functions, described in:
+        # https://github.com/pandas-dev/pandas/issues/39554
+        # So `.count` should always be tested last, until this bug is fixed
+        # in pandas, to avoid triggering that side effect
+ df_equals(modin_rolled.count(), pandas_rolled.count())
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -158,7 +167,6 @@ def test_series(data, window, min_periods, win_type):
df_equals(modin_rolled.std(ddof=0), pandas_rolled.std(ddof=0))
# Testing of Rolling class
else:
- df_equals(modin_rolled.count(), pandas_rolled.count())
df_equals(modin_rolled.sum(), pandas_rolled.sum())
df_equals(modin_rolled.mean(), pandas_rolled.mean())
df_equals(modin_rolled.median(), pandas_rolled.median())
@@ -186,6 +194,11 @@ def test_series(data, window, min_periods, win_type):
pandas_rolled.agg([np.sum, np.mean]),
)
df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))
+        # `Rolling.count` has a buggy side effect on other rolling functions, described in:
+        # https://github.com/pandas-dev/pandas/issues/39554
+        # So `.count` should always be tested last, until this bug is fixed
+        # in pandas, to avoid triggering that side effect
+ df_equals(modin_rolled.count(), pandas_rolled.count())
@pytest.mark.parametrize("closed", ["both", "right"])
@@ -196,10 +209,14 @@ def test_series_dt_index(closed):
pandas_rolled = pandas_series.rolling("3s", closed=closed)
modin_rolled = modin_series.rolling("3s", closed=closed)
- df_equals(modin_rolled.count(), pandas_rolled.count())
df_equals(modin_rolled.skew(), pandas_rolled.skew())
df_equals(
modin_rolled.apply(np.sum, raw=True), pandas_rolled.apply(np.sum, raw=True)
)
df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum))
df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))
+    # `Rolling.count` has a buggy side effect on other rolling functions, described in:
+    # https://github.com/pandas-dev/pandas/issues/39554
+    # So `.count` should always be tested last, until this bug is fixed
+    # in pandas, to avoid triggering that side effect
+ df_equals(modin_rolled.count(), pandas_rolled.count())
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index ec4aba6879a..86b4e876525 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -21,7 +21,7 @@
from pandas.core.base import SpecificationError
import sys
-from modin.utils import to_pandas, get_current_backend
+from modin.utils import to_pandas
from .utils import (
random_state,
RAND_LOW,
@@ -67,8 +67,9 @@
generate_multiindex,
test_data_diff_dtype,
)
+from modin.config import NPartitions
-pd.DEFAULT_NPARTITIONS = 4
+NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
@@ -179,13 +180,13 @@ def inter_df_math_helper_one_side(modin_series, pandas_series, op):
pass
-def create_test_series(vals, sort=False):
+def create_test_series(vals, sort=False, **kwargs):
if isinstance(vals, dict):
- modin_series = pd.Series(vals[next(iter(vals.keys()))])
- pandas_series = pandas.Series(vals[next(iter(vals.keys()))])
+ modin_series = pd.Series(vals[next(iter(vals.keys()))], **kwargs)
+ pandas_series = pandas.Series(vals[next(iter(vals.keys()))], **kwargs)
else:
- modin_series = pd.Series(vals)
- pandas_series = pandas.Series(vals)
+ modin_series = pd.Series(vals, **kwargs)
+ pandas_series = pandas.Series(vals, **kwargs)
if sort:
modin_series = modin_series.sort_values().reset_index(drop=True)
pandas_series = pandas_series.sort_values().reset_index(drop=True)
@@ -306,12 +307,6 @@ def test___delitem__(data):
df_equals(modin_series, pandas_series)
-@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
-def test___div__(data):
- modin_series, pandas_series = create_test_series(data)
- inter_df_math_helper(modin_series, pandas_series, "__div__")
-
-
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_divmod(data):
modin_series, pandas_series = create_test_series(data)
@@ -531,6 +526,39 @@ def test___setitem__(data):
df_equals(modin_series, pandas_series)
+@pytest.mark.parametrize(
+ "key",
+ [
+ pytest.param(lambda idx: slice(1, 3), id="location_based_slice"),
+ pytest.param(lambda idx: slice(idx[1], idx[-1]), id="index_based_slice"),
+ pytest.param(lambda idx: [idx[0], idx[2], idx[-1]], id="list_of_labels"),
+ pytest.param(
+ lambda idx: [True if i % 2 else False for i in range(len(idx))],
+ id="boolean_mask",
+ ),
+ ],
+)
+@pytest.mark.parametrize(
+ "index",
+ [
+ pytest.param(
+ lambda idx_len: [chr(x) for x in range(ord("a"), ord("a") + idx_len)],
+ id="str_index",
+ ),
+ pytest.param(lambda idx_len: list(range(1, idx_len + 1)), id="int_index"),
+ ],
+)
+def test___setitem___non_hashable(key, index):
+ data = np.arange(5)
+ index = index(len(data))
+ key = key(index)
+ md_sr, pd_sr = create_test_series(data, index=index)
+
+ md_sr[key] = 10
+ pd_sr[key] = 10
+ df_equals(md_sr, pd_sr)
+
+
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___sizeof__(data):
modin_series, pandas_series = create_test_series(data)
@@ -593,16 +621,10 @@ def test_add_suffix(data):
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys)
def test_agg(data, func):
- # AssertionError may be arisen in case of
- # mismathing of index/columns in Modin and pandas.
- # See details in pandas issue 36189.
- try:
- eval_general(
- *create_test_series(data),
- lambda df: df.agg(func),
- )
- except AssertionError:
- pass
+ eval_general(
+ *create_test_series(data),
+ lambda df: df.agg(func),
+ )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -624,16 +646,10 @@ def test_agg_numeric(request, data, func):
request.node.name, numeric_dfs
):
axis = 0
- # AssertionError may be arisen in case of
- # mismathing of index/columns in Modin and pandas.
- # See details in pandas issue 36189.
- try:
- eval_general(
- *create_test_series(data),
- lambda df: df.agg(func, axis),
- )
- except AssertionError:
- pass
+ eval_general(
+ *create_test_series(data),
+ lambda df: df.agg(func, axis),
+ )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -656,16 +672,10 @@ def test_agg_numeric_except(request, data, func):
@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys)
def test_aggregate(data, func):
axis = 0
- # AssertionError may be arisen in case of
- # mismathing of index/columns in Modin and pandas.
- # See details in pandas issue 36189.
- try:
- eval_general(
- *create_test_series(data),
- lambda df: df.aggregate(func, axis),
- )
- except AssertionError:
- pass
+ eval_general(
+ *create_test_series(data),
+ lambda df: df.aggregate(func, axis),
+ )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -688,16 +698,10 @@ def test_aggregate_numeric(request, data, func):
request.node.name, numeric_dfs
):
axis = 0
- # AssertionError may be arisen in case of
- # mismathing of index/columns in Modin and pandas.
- # See details in pandas issue 36189.
- try:
- eval_general(
- *create_test_series(data),
- lambda df: df.agg(func, axis),
- )
- except AssertionError:
- pass
+ eval_general(
+ *create_test_series(data),
+ lambda df: df.agg(func, axis),
+ )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -823,16 +827,10 @@ def test_append(data):
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys)
def test_apply(data, func):
- # AssertionError may be arisen in case of
- # mismathing of index/columns in Modin and pandas.
- # See details in pandas issue 36189.
- try:
- eval_general(
- *create_test_series(data),
- lambda df: df.apply(func),
- )
- except AssertionError:
- pass
+ eval_general(
+ *create_test_series(data),
+ lambda df: df.apply(func),
+ )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -871,16 +869,10 @@ def test_apply_external_lib():
@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys)
def test_apply_numeric(request, data, func):
if name_contains(request.node.name, numeric_dfs):
- # AssertionError may be arisen in case of
- # mismathing of index/columns in Modin and pandas.
- # See details in pandas issue 36189.
- try:
- eval_general(
- *create_test_series(data),
- lambda df: df.apply(func),
- )
- except AssertionError:
- pass
+ eval_general(
+ *create_test_series(data),
+ lambda df: df.apply(func),
+ )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -2491,6 +2483,15 @@ def test_reindex(data):
pandas_series.reindex(index=[0, 1, 5]),
)
+ # MultiIndex
+ modin_series, pandas_series = create_test_series(data)
+ modin_series.index, pandas_series.index = [
+ generate_multiindex(len(pandas_series))
+ ] * 2
+ pandas_result = pandas_series.reindex(list(reversed(pandas_series.index)))
+ modin_result = modin_series.reindex(list(reversed(modin_series.index)))
+ df_equals(pandas_result, modin_result)
+
def test_reindex_like():
df1 = pd.DataFrame(
@@ -2903,6 +2904,7 @@ def test_shift(data):
df_equals(modin_series.shift(fill_value=777), pandas_series.shift(fill_value=777))
df_equals(modin_series.shift(periods=7), pandas_series.shift(periods=7))
df_equals(modin_series.shift(periods=-3), pandas_series.shift(periods=-3))
+ eval_general(modin_series, pandas_series, lambda df: df.shift(axis=1))
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -2949,30 +2951,27 @@ def test_slice_shift(data, index, periods):
@pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"])
def test_sort_index(data, ascending, sort_remaining, na_position):
modin_series, pandas_series = create_test_series(data)
- df_equals(
- modin_series.sort_index(
- ascending=ascending, sort_remaining=sort_remaining, na_position=na_position
- ),
- pandas_series.sort_index(
- ascending=ascending, sort_remaining=sort_remaining, na_position=na_position
+ eval_general(
+ modin_series,
+ pandas_series,
+ lambda df: df.sort_index(
+ ascending=ascending,
+ sort_remaining=sort_remaining,
+ na_position=na_position,
),
)
- modin_series_cp = modin_series.copy()
- pandas_series_cp = pandas_series.copy()
- modin_series_cp.sort_index(
- ascending=ascending,
- sort_remaining=sort_remaining,
- na_position=na_position,
- inplace=True,
- )
- pandas_series_cp.sort_index(
- ascending=ascending,
- sort_remaining=sort_remaining,
- na_position=na_position,
- inplace=True,
+ eval_general(
+ modin_series.copy(),
+ pandas_series.copy(),
+ lambda df: df.sort_index(
+ ascending=ascending,
+ sort_remaining=sort_remaining,
+ na_position=na_position,
+ inplace=True,
+ ),
+ __inplace__=True,
)
- df_equals(modin_series_cp, pandas_series_cp)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -3219,17 +3218,10 @@ def test_transform(data, func):
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("func", agg_func_except_values, ids=agg_func_except_keys)
def test_transform_except(data, func):
- # 1) SpecificationError is arisen because we treat a Series as a DataFrame.
- # See details in pandas issues 36036.
- # 2) AssertionError is arisen because of mismatching of thrown exceptions
- # (SpecificationError in Modin, ValueError in pandas).
- # Since we perform `transform` via `apply` then SpecificationError is arisen earlier.
- # That's why the exception are mismathed.
- with pytest.raises((SpecificationError, AssertionError)):
- eval_general(
- *create_test_series(data),
- lambda df: df.transform(func),
- )
+ eval_general(
+ *create_test_series(data),
+ lambda df: df.transform(func),
+ )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -3322,16 +3314,19 @@ def test_unique(data):
modin_result = modin_series.unique()
pandas_result = pandas_series.unique()
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.Series([2, 1, 3, 3], name="A").unique()
pandas_result = pandas.Series([2, 1, 3, 3], name="A").unique()
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique()
pandas_result = pandas.Series(
[pd.Timestamp("2016-01-01") for _ in range(3)]
).unique()
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.Series(
[pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)]
@@ -3340,10 +3335,12 @@ def test_unique(data):
[pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)]
).unique()
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pandas.Series(pd.Categorical(list("baabc"))).unique()
pandas_result = pd.Series(pd.Categorical(list("baabc"))).unique()
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
modin_result = pd.Series(
pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
@@ -3352,6 +3349,7 @@ def test_unique(data):
pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
).unique()
assert_array_equal(modin_result, pandas_result)
+ assert modin_result.shape == pandas_result.shape
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -3411,35 +3409,50 @@ def sort_index_for_equal_values(result, ascending):
i += 1
return type(result)(result, index=new_index)
- # We sort indices for pandas result because of issue #1650
+    # We sort indices for both the Modin and pandas results because of issue #1650
modin_series, pandas_series = create_test_series(test_data_values[0])
- modin_result = modin_series.value_counts(normalize=normalize, ascending=False)
-
- if get_current_backend() == "BaseOnPython":
- modin_result = sort_index_for_equal_values(modin_result, ascending=False)
+ modin_result = sort_index_for_equal_values(
+ modin_series.value_counts(normalize=normalize, ascending=False), False
+ )
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(normalize=normalize, ascending=False), False
)
df_equals(modin_result, pandas_result)
- modin_result = modin_series.value_counts(bins=bins, ascending=False)
-
- if get_current_backend() == "BaseOnPython":
- modin_result = sort_index_for_equal_values(modin_result, ascending=False)
-
+ modin_result = sort_index_for_equal_values(
+ modin_series.value_counts(bins=bins, ascending=False), False
+ )
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(bins=bins, ascending=False), False
)
df_equals(modin_result, pandas_result)
- modin_result = modin_series.value_counts(dropna=dropna, ascending=True)
+ modin_result = sort_index_for_equal_values(
+ modin_series.value_counts(dropna=dropna, ascending=True), True
+ )
+ pandas_result = sort_index_for_equal_values(
+ pandas_series.value_counts(dropna=dropna, ascending=True), True
+ )
+ df_equals(modin_result, pandas_result)
- if get_current_backend() == "BaseOnPython":
- modin_result = sort_index_for_equal_values(modin_result, ascending=True)
+ # from issue #2365
+ arr = np.random.rand(2 ** 6)
+ arr[::10] = np.nan
+ modin_series, pandas_series = create_test_series(arr)
+ modin_result = sort_index_for_equal_values(
+ modin_series.value_counts(dropna=False, ascending=True), True
+ )
+ pandas_result = sort_index_for_equal_values(
+ pandas_series.value_counts(dropna=False, ascending=True), True
+ )
+ df_equals(modin_result, pandas_result)
+ modin_result = sort_index_for_equal_values(
+ modin_series.value_counts(dropna=False, ascending=False), False
+ )
pandas_result = sort_index_for_equal_values(
- pandas_series.value_counts(dropna=dropna, ascending=True), True
+ pandas_series.value_counts(dropna=False, ascending=False), False
)
df_equals(modin_result, pandas_result)
@@ -4397,17 +4410,18 @@ def test_encode(data, encoding_type):
df_equals(modin_result, pandas_result)
-@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys)
-def test_hasattr_sparse(data):
- modin_series, pandas_series = create_test_series(data)
- try:
- pandas_result = hasattr(pandas_series, "sparse")
- except Exception as e:
- with pytest.raises(type(e)):
- hasattr(modin_series, "sparse")
- else:
- modin_result = hasattr(modin_series, "sparse")
- assert modin_result == pandas_result
+@pytest.mark.parametrize(
+ "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"]
+)
+def test_hasattr_sparse(is_sparse_data):
+ modin_df, pandas_df = (
+ create_test_series(
+ pandas.arrays.SparseArray(test_data["float_nan_data"].values())
+ )
+ if is_sparse_data
+ else create_test_series(test_data["float_nan_data"])
+ )
+ eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse"))
@pytest.mark.parametrize(
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index 719c0778b30..9e4b33b95e9 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -15,31 +15,40 @@
import numpy as np
import math
import pandas
-from pandas.util.testing import (
- assert_almost_equal,
+from pandas.testing import (
+ assert_series_equal,
assert_frame_equal,
- assert_categorical_equal,
+ assert_index_equal,
+ assert_extension_array_equal,
)
import modin.pandas as pd
from modin.utils import to_pandas
-from modin.config import TestDatasetSize
+from modin.config import TestDatasetSize, TrackFileLeaks
from io import BytesIO
+import os
+from string import ascii_letters
+import csv
+import psutil
+import functools
random_state = np.random.RandomState(seed=42)
DATASET_SIZE_DICT = {
- "small": (2 ** 2, 2 ** 3),
- "normal": (2 ** 6, 2 ** 8),
- "big": (2 ** 7, 2 ** 12),
+ "Small": (2 ** 2, 2 ** 3),
+ "Normal": (2 ** 6, 2 ** 8),
+ "Big": (2 ** 7, 2 ** 12),
}
# Size of test dataframes
-NCOLS, NROWS = DATASET_SIZE_DICT.get(TestDatasetSize.get(), DATASET_SIZE_DICT["normal"])
+NCOLS, NROWS = DATASET_SIZE_DICT.get(TestDatasetSize.get(), DATASET_SIZE_DICT["Normal"])
# Range for values for test data
RAND_LOW = 0
RAND_HIGH = 100
+# Directory for storing I/O operations test data
+IO_OPS_DATA_DIR = os.path.join(os.path.dirname(__file__), "io_tests_data")
+
# Input data and functions for the tests
# The test data that we will test our code against
test_data = {
@@ -419,11 +428,18 @@
"utf_8_sig",
]
+# Raising these exceptions can be caused by unexpected behavior of an I/O
+# operation test, but they could slip through the eval_io function unnoticed
+# since pandas and Modin would raise exceptions of the same type
+io_ops_bad_exc = [TypeError, FileNotFoundError]
+
+# Mapping of file compression type to file extension
+COMP_TO_EXT = {"gzip": "gz", "bz2": "bz2", "xz": "xz", "zip": "zip"}
+
def categories_equals(left, right):
assert (left.ordered and right.ordered) or (not left.ordered and not right.ordered)
- is_category_ordered = left.ordered
- assert_categorical_equal(left, right, check_category_order=is_category_ordered)
+ assert_extension_array_equal(left, right)
def df_categories_equals(df1, df2):
@@ -439,12 +455,10 @@ def df_categories_equals(df1, df2):
categories_columns = df1.select_dtypes(include="category").columns
for column in categories_columns:
- is_category_ordered = df1[column].dtype.ordered
- assert_categorical_equal(
+ assert_extension_array_equal(
df1[column].values,
df2[column].values,
check_dtype=False,
- check_category_order=is_category_ordered,
)
@@ -458,12 +472,6 @@ def df_equals(df1, df2):
Returns:
True if df1 is equal to df2.
"""
- types_for_almost_equals = (
- pandas.core.indexes.range.RangeIndex,
- pandas.core.indexes.base.Index,
- np.recarray,
- )
-
# Gets AttributError if modin's groupby object is not import like this
from modin.pandas.groupby import DataFrameGroupBy
@@ -522,12 +530,10 @@ def df_equals(df1, df2):
check_categorical=False,
)
df_categories_equals(df1, df2)
- elif isinstance(df1, types_for_almost_equals) and isinstance(
- df2, types_for_almost_equals
- ):
- assert_almost_equal(df1, df2, check_dtype=False)
+ elif isinstance(df1, pandas.Index) and isinstance(df2, pandas.Index):
+ assert_index_equal(df1, df2)
elif isinstance(df1, pandas.Series) and isinstance(df2, pandas.Series):
- assert_almost_equal(df1, df2, check_dtype=False, check_series_type=False)
+ assert_series_equal(df1, df2, check_dtype=False, check_series_type=False)
elif isinstance(df1, groupby_types) and isinstance(df2, groupby_types):
for g1, g2 in zip(df1, df2):
assert g1[0] == g2[0]
@@ -543,6 +549,8 @@ def df_equals(df1, df2):
elif isinstance(df1, pandas.core.arrays.numpy_.PandasArray):
assert isinstance(df2, pandas.core.arrays.numpy_.PandasArray)
assert df1 == df2
+ elif isinstance(df1, np.recarray) and isinstance(df2, np.recarray):
+ np.testing.assert_array_equal(df1, df2)
else:
if df1 != df2:
np.testing.assert_almost_equal(df1, df2)
@@ -638,8 +646,15 @@ def eval_general(
comparator=df_equals,
__inplace__=False,
check_exception_type=True,
+ raising_exceptions=None,
+ check_kwargs_callable=True,
+ md_extra_kwargs=None,
**kwargs,
):
+ if raising_exceptions:
+ assert (
+ check_exception_type
+ ), "if raising_exceptions is not None or False, check_exception_type should be True"
md_kwargs, pd_kwargs = {}, {}
def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
@@ -653,12 +668,16 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
repr(fn(modin_df, **md_kwargs))
if check_exception_type:
assert isinstance(md_e.value, type(pd_e))
+ if raising_exceptions:
+ assert not isinstance(
+ md_e.value, tuple(raising_exceptions)
+ ), f"not acceptable exception type: {md_e.value}"
else:
md_result = fn(modin_df, **md_kwargs)
return (md_result, pd_result) if not __inplace__ else (modin_df, pandas_df)
for key, value in kwargs.items():
- if callable(value):
+ if check_kwargs_callable and callable(value):
values = execute_callable(value)
# that means, that callable raised an exception
if values is None:
@@ -671,6 +690,10 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
md_kwargs[key] = md_value
pd_kwargs[key] = pd_value
+ if md_extra_kwargs:
+ assert isinstance(md_extra_kwargs, dict)
+ md_kwargs.update(md_extra_kwargs)
+
values = execute_callable(
operation, md_kwargs=md_kwargs, pd_kwargs=pd_kwargs, inplace=__inplace__
)
@@ -678,8 +701,103 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
comparator(*values)
+def eval_io(
+ fn_name,
+ comparator=df_equals,
+ cast_to_str=False,
+ check_exception_type=True,
+ raising_exceptions=io_ops_bad_exc,
+ check_kwargs_callable=True,
+ modin_warning=None,
+ md_extra_kwargs=None,
+ *args,
+ **kwargs,
+):
+ """Evaluate I/O operation outputs equality check.
+
+ Parameters
+ ----------
+ fn_name: str
+ I/O operation name ("read_csv" for example).
+ comparator: obj
+ Function to perform comparison.
+ cast_to_str: bool
+ There could be some missmatches in dtypes, so we're
+ casting the whole frame to `str` before comparison.
+ See issue #1931 for details.
+ check_exception_type: bool
+ Check or not exception types in the case of operation fail
+ (compare exceptions types raised by Pandas and Modin).
+ raising_exceptions: Exception or list of Exceptions
+ Exceptions that should be raised even if they are raised
+ both by Pandas and Modin (check evaluated only if
+ `check_exception_type` passed as `True`).
+ modin_warning: obj
+ Warning that should be raised by Modin.
+ md_extra_kwargs: dict
+ Modin operation specific kwargs.
+ """
+
+ def applyier(module, *args, **kwargs):
+ result = getattr(module, fn_name)(*args, **kwargs)
+ if cast_to_str:
+ result = result.astype(str)
+ return result
+
+ def call_eval_general():
+ eval_general(
+ pd,
+ pandas,
+ applyier,
+ check_exception_type=check_exception_type,
+ raising_exceptions=raising_exceptions,
+ check_kwargs_callable=check_kwargs_callable,
+ md_extra_kwargs=md_extra_kwargs,
+ *args,
+ **kwargs,
+ )
+
+ if modin_warning:
+ with pytest.warns(modin_warning):
+ call_eval_general()
+ else:
+ call_eval_general()
+
+
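For orientation, a minimal usage sketch of the new `eval_io` helper (editorial note, not part of the PR; it assumes the helper is importable from `modin.pandas.test.utils` and reuses the `newlines.csv` data file referenced elsewhere in this diff):

    from modin.pandas.test.utils import eval_io

    # pandas.read_csv and modin.pandas.read_csv are called with the same kwargs;
    # the results are compared with df_equals, exception types are compared on
    # failure, and TypeError/FileNotFoundError are treated as test failures.
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer="modin/pandas/test/data/newlines.csv",
        nrows=10,
    )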
+def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs):
+ """Evaluate I/O operation outputs equality check by using `csv_str`
+ data passed as python str (csv test file will be created from `csv_str`).
+
+ Parameters
+ ----------
+ csv_str: str
+ Test data for storing to csv file.
+ unique_filename: str
+ csv file name.
+ """
+ try:
+ with open(unique_filename, "w") as f:
+ f.write(csv_str)
+
+ eval_io(
+ filepath_or_buffer=unique_filename,
+ fn_name="read_csv",
+ **kwargs,
+ )
+
+ finally:
+ if os.path.exists(unique_filename):
+ try:
+ os.remove(unique_filename)
+ except PermissionError:
+ pass
+
+
def create_test_dfs(*args, **kwargs):
- return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)
+ post_fn = kwargs.pop("post_fn", lambda df: df)
+ return map(
+ post_fn, [pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)]
+ )
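A small illustrative sketch of the new `post_fn` hook (editorial, not from the PR): the function is applied to both the Modin and the pandas frame, so any normalization stays symmetric and the two results remain comparable.

    from modin.pandas.test.utils import create_test_dfs, df_equals

    # post_fn is applied to both frames before they are returned.
    modin_df, pandas_df = create_test_dfs(
        {"col1": [1, 2, 3], "col2": [4.0, 5.0, 6.0]},
        post_fn=lambda df: df.astype(str),
    )
    df_equals(modin_df, pandas_df)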
def generate_dfs():
@@ -779,3 +897,322 @@ def generate_none_dfs():
}
)
return df, df2
+
+
+def get_unique_filename(
+ test_name: str = "test",
+ kwargs: dict = {},
+ extension: str = "csv",
+ data_dir: str = IO_OPS_DATA_DIR,
+ suffix: str = "",
+ debug_mode=False,
+):
+ """Returns unique file name with specified parameters.
+
+ Parameters
+ ----------
+ test_name: str
+ name of the test for which the unique file name is needed.
+ kwargs: list of ints
+ Unique combiantion of test parameters for creation of unique name.
+ extension: str
+ Extension of unique file.
+ data_dir: str
+ Data directory where test files will be created.
+ suffix: str
+ String to append to the resulted name.
+ debug_mode: bool
+ Get unique filename containing kwargs values.
+ Otherwise kwargs values will be replaced with hash equivalent.
+
+ Returns
+ -------
+ Unique file name.
+ """
+ suffix_part = f"_{suffix}" if suffix else ""
+ extension_part = f".{extension}" if extension else ""
+ if debug_mode:
+        # shortcut if the kwargs parameter is not provided
+ if len(kwargs) == 0 and extension == "csv" and suffix == "":
+ return os.path.join(data_dir, (test_name + suffix_part + f".{extension}"))
+
+ assert "." not in extension, "please provide pure extension name without '.'"
+ prohibited_chars = ['"', "\n"]
+ non_prohibited_char = "np_char"
+ char_counter = 0
+ kwargs_name = dict(kwargs)
+ for key, value in kwargs_name.items():
+ for char in prohibited_chars:
+ if isinstance(value, str) and char in value or callable(value):
+ kwargs_name[key] = non_prohibited_char + str(char_counter)
+ char_counter += 1
+ parameters_values = "_".join(
+ [
+ str(value)
+ if not isinstance(value, (list, tuple))
+ else "_".join([str(x) for x in value])
+ for value in kwargs_name.values()
+ ]
+ )
+ return os.path.join(
+ data_dir, test_name + parameters_values + suffix_part + extension_part
+ )
+ else:
+ import uuid
+
+ return os.path.join(data_dir, uuid.uuid1().hex + suffix_part + extension_part)
+
+
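Two illustrative calls (editorial sketch) showing the effect of the `debug_mode` flag; both names land in the `io_tests_data` directory defined above.

    from modin.pandas.test.utils import get_unique_filename

    # Default: an anonymous uuid-based name, e.g. ".../io_tests_data/<hex>.csv".
    anonymous_name = get_unique_filename(extension="csv")

    # debug_mode=True: the kwargs values are embedded in the name instead.
    readable_name = get_unique_filename(
        test_name="test_read_csv",
        kwargs={"nrows": 10, "skiprows": 2},
        extension="csv",
        debug_mode=True,
    )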
+def get_random_string():
+ random_string = "".join(
+ random_state.choice([x for x in ascii_letters], size=10).tolist()
+ )
+ return random_string
+
+
+def insert_lines_to_csv(
+ csv_name: str,
+ lines_positions: list,
+ lines_type: str = "blank",
+ encoding: str = None,
+ **csv_reader_writer_params,
+):
+ """Insert lines to ".csv" file.
+
+ Parameters
+ ----------
+ csv_name: str
+ ".csv" file that should be modified.
+ lines_positions: list of ints
+ Lines postions that sghould be modified (serial number
+ of line - begins from 0, ends in - 1).
+ lines_type: str
+ Lines types that should be inserted to ".csv" file. Possible types:
+ "blank" - empty line without any delimiters/separators,
+ "bad" - lines with len(lines_data) > cols_number
+ encoding: str
+ Encoding type that should be used during file reading and writing.
+ """
+ cols_number = len(pandas.read_csv(csv_name, nrows=1).columns)
+ if lines_type == "blank":
+ lines_data = []
+ elif lines_type == "bad":
+ cols_number = len(pandas.read_csv(csv_name, nrows=1).columns)
+ lines_data = [x for x in range(cols_number + 1)]
+ else:
+ raise ValueError(
+ f"acceptable values for parameter are ['blank', 'bad'], actually passed {lines_type}"
+ )
+ lines = []
+ dialect = "excel"
+ with open(csv_name, "r", encoding=encoding, newline="") as read_file:
+ try:
+ dialect = csv.Sniffer().sniff(read_file.read())
+ read_file.seek(0)
+ except Exception:
+ dialect = None
+
+ reader = csv.reader(
+ read_file,
+ dialect=dialect if dialect is not None else "excel",
+ **csv_reader_writer_params,
+ )
+ counter = 0
+ for row in reader:
+ if counter in lines_positions:
+ lines.append(lines_data)
+ else:
+ lines.append(row)
+ counter += 1
+ with open(csv_name, "w", encoding=encoding, newline="") as write_file:
+ writer = csv.writer(
+ write_file,
+ dialect=dialect if dialect is not None else "excel",
+ **csv_reader_writer_params,
+ )
+ writer.writerows(lines)
+
+
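A minimal usage sketch (editorial; the file name below is hypothetical and must already exist):

    from modin.pandas.test.utils import insert_lines_to_csv

    # Replace rows 2 and 5 of an existing csv with blank lines, reusing the
    # dialect that csv.Sniffer detects from the file itself.
    insert_lines_to_csv(
        csv_name="existing_test_file.csv",  # hypothetical path
        lines_positions=[2, 5],
        lines_type="blank",
    )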
+def _get_open_files():
+ """
+    psutil's open_files() returns a lot of extra information that is allowed to
+    differ, like the file position; for simplicity we only care about the path and fd.
+ """
+ return sorted((info.path, info.fd) for info in psutil.Process().open_files())
+
+
+def check_file_leaks(func):
+ """
+ A decorator that ensures that no *newly* opened file handles are left
+    open after the decorated function finishes.
+ """
+ if not TrackFileLeaks.get():
+ return func
+
+ @functools.wraps(func)
+ def check(*a, **kw):
+ fstart = _get_open_files()
+ try:
+ return func(*a, **kw)
+ finally:
+ leaks = []
+ for item in _get_open_files():
+ try:
+ fstart.remove(item)
+ except ValueError:
+ # ignore files in /proc/, as they have nothing to do with
+ # modin reading any data (and this is what we care about)
+ if not item[0].startswith("/proc/"):
+ leaks.append(item)
+ assert (
+ not leaks
+ ), f"Unexpected open handles left for: {', '.join(item[0] for item in leaks)}"
+
+ return check
+
+
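A sketch of how the decorator is meant to be used (editorial; the test name and body are hypothetical, the data path is reused from this diff):

    import modin.pandas as pd
    from modin.pandas.test.utils import check_file_leaks

    @check_file_leaks
    def test_read_csv_leaves_no_open_handles():
        # The decorator snapshots the open file descriptors before the call
        # and asserts that no new ones remain open once the test returns.
        df = pd.read_csv("modin/pandas/test/data/newlines.csv")
        assert len(df) > 0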
+def dummy_decorator():
+ """A problematic decorator that does not use `functools.wraps`. This introduces unwanted local variables for
+ inspect.currentframe. This decorator is used in test_io to test `read_csv` and `read_table`
+ """
+
+ def wrapper(method):
+ def wrapped_function(self, *args, **kwargs):
+ result = method(self, *args, **kwargs)
+ return result
+
+ return wrapped_function
+
+ return wrapper
+
+
+def _make_csv_file(filenames):
+ def _csv_file_maker(
+ filename,
+ row_size=NROWS,
+ force=True,
+ delimiter=",",
+ encoding=None,
+ compression="infer",
+ additional_col_values=None,
+ remove_randomness=False,
+ add_blank_lines=False,
+ add_bad_lines=False,
+ add_nan_lines=False,
+ thousands_separator=None,
+ decimal_separator=None,
+ comment_col_char=None,
+ quoting=csv.QUOTE_MINIMAL,
+ quotechar='"',
+ doublequote=True,
+ escapechar=None,
+ line_terminator=None,
+ ):
+ if os.path.exists(filename) and not force:
+ pass
+ else:
+ dates = pandas.date_range("2000", freq="h", periods=row_size)
+ data = {
+ "col1": np.arange(row_size) * 10,
+ "col2": [str(x.date()) for x in dates],
+ "col3": np.arange(row_size) * 10,
+ "col4": [str(x.time()) for x in dates],
+ "col5": [get_random_string() for _ in range(row_size)],
+ "col6": random_state.uniform(low=0.0, high=10000.0, size=row_size),
+ }
+
+ if additional_col_values is not None:
+ assert isinstance(additional_col_values, (list, tuple))
+ data.update(
+ {
+ "col7": random_state.choice(
+ additional_col_values, size=row_size
+ ),
+ }
+ )
+ df = pandas.DataFrame(data)
+ if remove_randomness:
+ df = df[["col1", "col2", "col3", "col4"]]
+ if add_nan_lines:
+ for i in range(0, row_size, row_size // (row_size // 10)):
+ df.loc[i] = pandas.Series()
+ if comment_col_char:
+ char = comment_col_char if isinstance(comment_col_char, str) else "#"
+ df.insert(
+ loc=0,
+ column="col_with_comments",
+ value=[char if (x + 2) == 0 else x for x in range(row_size)],
+ )
+
+ if thousands_separator:
+ for col_id in ["col1", "col3"]:
+ df[col_id] = df[col_id].apply(
+ lambda x: f"{x:,d}".replace(",", thousands_separator)
+ )
+ df["col6"] = df["col6"].apply(
+ lambda x: f"{x:,f}".replace(",", thousands_separator)
+ )
+ filename = (
+ f"{filename}.{COMP_TO_EXT[compression]}"
+ if compression != "infer"
+ else filename
+ )
+ df.to_csv(
+ filename,
+ sep=delimiter,
+ encoding=encoding,
+ compression=compression,
+ index=False,
+ decimal=decimal_separator if decimal_separator else ".",
+ line_terminator=line_terminator,
+ quoting=quoting,
+ quotechar=quotechar,
+ doublequote=doublequote,
+ escapechar=escapechar,
+ )
+ csv_reader_writer_params = {
+ "delimiter": delimiter,
+ "doublequote": doublequote,
+ "escapechar": escapechar,
+ "lineterminator": line_terminator if line_terminator else os.linesep,
+ "quotechar": quotechar,
+ "quoting": quoting,
+ }
+ if add_blank_lines:
+ insert_lines_to_csv(
+ csv_name=filename,
+ lines_positions=[
+ x for x in range(5, row_size, row_size // (row_size // 10))
+ ],
+ lines_type="blank",
+ encoding=encoding,
+ **csv_reader_writer_params,
+ )
+ if add_bad_lines:
+ insert_lines_to_csv(
+ csv_name=filename,
+ lines_positions=[
+ x for x in range(6, row_size, row_size // (row_size // 10))
+ ],
+ lines_type="bad",
+ encoding=encoding,
+ **csv_reader_writer_params,
+ )
+ filenames.append(filename)
+ return df
+
+ return _csv_file_maker
+
+
+def teardown_test_file(test_path):
+ if os.path.exists(test_path):
+ # PermissionError can occur because of issue #2533
+ try:
+ os.remove(test_path)
+ except PermissionError:
+ pass
+
+
+def teardown_test_files(test_paths: list):
+ for path in test_paths:
+ teardown_test_file(path)
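
A minimal usage sketch (not part of the patch) of how the CSV helpers above are typically wired together; the fixture and test names are illustrative, and the import path assumes these helpers live in modin.pandas.test.utils, which is where the new tests below import their utilities from.

import os
import pytest
from modin.pandas.test.utils import _make_csv_file, teardown_test_files

@pytest.fixture
def make_csv_file():
    filenames = []
    # The maker appends every file it creates to `filenames` ...
    yield _make_csv_file(filenames)
    # ... so teardown can remove them even when the test fails.
    teardown_test_files(filenames)

def test_generated_csv_exists(make_csv_file, tmp_path):
    path = str(tmp_path / "example.csv")
    make_csv_file(path, row_size=100, add_blank_lines=True)
    assert os.path.exists(path)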
diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py
index 1c631514c15..bb1f459f411 100644
--- a/modin/pandas/utils.py
+++ b/modin/pandas/utils.py
@@ -13,6 +13,8 @@
"""Implement utils for pandas component."""
+from pandas import MultiIndex
+
def from_non_pandas(df, index, columns, dtype):
"""
@@ -24,7 +26,7 @@ def from_non_pandas(df, index, columns, dtype):
----------
What arguments does this function have.
[
- PARAMETER_NAME: PARAMETERS TYPES
+ PARAMETER_NAME : PARAMETERS TYPES
Description.
]
@@ -48,7 +50,7 @@ def from_pandas(df):
Parameters
----------
- df: pandas.DataFrame
+ df : pandas.DataFrame
The pandas DataFrame to convert.
Returns
@@ -67,7 +69,7 @@ def from_arrow(at):
Parameters
----------
- at: Arrow Table
+ at : Arrow Table
The Arrow Table to convert from.
Returns
@@ -93,7 +95,7 @@ def is_scalar(obj):
Parameters
----------
- val: object
+ val : object
Object to check.
Returns
@@ -105,3 +107,38 @@ def is_scalar(obj):
from .base import BasePandasDataset
return not isinstance(obj, BasePandasDataset) and pandas_is_scalar(obj)
+
+
+def from_modin_frame_to_mi(df, sortorder=None, names=None):
+ """
+ Make a pandas.MultiIndex from a DataFrame.
+
+ Parameters
+ ----------
+ df : DataFrame
+ DataFrame to be converted to pandas.MultiIndex.
+ sortorder : int, optional
+ Level of sortedness (must be lexicographically sorted by that
+ level).
+ names : list-like, optional
+ If no names are provided, use the column names, or tuple of column
+ names if the columns is a MultiIndex. If a sequence, overwrite
+ names with the given sequence.
+
+ Returns
+ -------
+ pandas.MultiIndex
+ The pandas.MultiIndex representation of the given DataFrame.
+ """
+ from .dataframe import DataFrame
+
+ if isinstance(df, DataFrame):
+ from modin.error_message import ErrorMessage
+
+ ErrorMessage.default_to_pandas("`MultiIndex.from_frame`")
+ df = df._to_pandas()
+ return _original_pandas_MultiIndex_from_frame(df, sortorder, names)
+
+
+_original_pandas_MultiIndex_from_frame = MultiIndex.from_frame
+MultiIndex.from_frame = from_modin_frame_to_mi
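
A minimal sketch (not part of the patch) of what the monkey-patch above changes for callers, assuming that importing modin.pandas applies it; the frame contents are made up for illustration.

import pandas
import modin.pandas as pd

md_df = pd.DataFrame({"letter": ["a", "a", "b"], "number": [1, 2, 1]})
# `MultiIndex.from_frame` now also accepts a Modin DataFrame: it warns that it
# defaults to pandas, converts the frame, and returns a plain pandas.MultiIndex.
mi = pandas.MultiIndex.from_frame(md_df, names=["letter", "number"])
print(type(mi))  # <class 'pandas.core.indexes.multi.MultiIndex'>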
diff --git a/modin/test/__init__.py b/modin/test/__init__.py
new file mode 100644
index 00000000000..cae6413e559
--- /dev/null
+++ b/modin/test/__init__.py
@@ -0,0 +1,12 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
diff --git a/modin/test/backends/base/test_internals.py b/modin/test/backends/base/test_internals.py
new file mode 100644
index 00000000000..be26075ae72
--- /dev/null
+++ b/modin/test/backends/base/test_internals.py
@@ -0,0 +1,84 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import pandas
+import pytest
+
+from modin.pandas.test.utils import (
+ test_data_values,
+ create_test_dfs,
+ df_equals,
+)
+from modin.config import NPartitions
+
+NPartitions.put(4)
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize("item_length", [0, 1, 2])
+@pytest.mark.parametrize("loc", ["first", "first + 1", "middle", "penult", "last"])
+@pytest.mark.parametrize("replace", [True, False])
+def test_insert_item(axis, item_length, loc, replace):
+ data = test_data_values[0]
+
+ def post_fn(df):
+ return (
+ (df.iloc[:, :-item_length], df.iloc[:, -item_length:])
+ if axis
+ else (df.iloc[:-item_length, :], df.iloc[-item_length:, :])
+ )
+
+ def get_loc(frame, loc):
+ locs_dict = {
+ "first": 0,
+ "first + 1": 1,
+ "middle": len(frame.axes[axis]) // 2,
+ "penult": len(frame.axes[axis]) - 1,
+ "last": len(frame.axes[axis]),
+ }
+ return locs_dict[loc]
+
+ def get_reference(df, value, loc):
+ if axis == 0:
+ first_mask = df.iloc[:loc]
+ if replace:
+ loc += 1
+ second_mask = df.iloc[loc:]
+ else:
+ first_mask = df.iloc[:, :loc]
+ if replace:
+ loc += 1
+ second_mask = df.iloc[:, loc:]
+ return pandas.concat([first_mask, value, second_mask], axis=axis)
+
+ md_frames, pd_frames = create_test_dfs(data, post_fn=post_fn)
+ md_item1, md_item2 = md_frames
+ pd_item1, pd_item2 = pd_frames
+
+ index_loc = get_loc(pd_item1, loc)
+
+ pd_res = get_reference(pd_item1, loc=index_loc, value=pd_item2)
+ md_res = md_item1._query_compiler.insert_item(
+ axis=axis, loc=index_loc, value=md_item2._query_compiler, replace=replace
+ ).to_pandas()
+ df_equals(md_res, pd_res)
+
+ index_loc = get_loc(pd_item2, loc)
+
+ pd_res = get_reference(pd_item2, loc=index_loc, value=pd_item1)
+ md_res = md_item2._query_compiler.insert_item(
+ axis=axis, loc=index_loc, value=md_item1._query_compiler, replace=replace
+ ).to_pandas()
+
+ df_equals(md_res, pd_res)
diff --git a/modin/test/backends/pandas/test_internals.py b/modin/test/backends/pandas/test_internals.py
new file mode 100644
index 00000000000..19b843ca33c
--- /dev/null
+++ b/modin/test/backends/pandas/test_internals.py
@@ -0,0 +1,55 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import modin.pandas as pd
+from modin.pandas.test.utils import create_test_dfs
+from modin.config import NPartitions
+
+NPartitions.put(4)
+
+
+def test_aligning_blocks():
+ # Test the case when Modin frames have the same number of rows but different
+ # blocks (partition.list_of_blocks). See #2322 for details.
+ accm = pd.DataFrame(["-22\n"] * 162)
+ accm = accm.iloc[2:, :]
+ accm.reset_index(drop=True, inplace=True)
+ accm["T"] = pd.Series(["24.67\n"] * 145)
+
+ # see #2322 for details
+ repr(accm)
+
+
+def test_aligning_blocks_with_duplicated_index():
+ # Same problem as in `test_aligning_blocks` but with duplicated values in index.
+ data11 = [0, 1]
+ data12 = [2, 3]
+
+ data21 = [0]
+ data22 = [1, 2, 3]
+
+ df1 = pd.DataFrame(data11).append(pd.DataFrame(data12))
+ df2 = pd.DataFrame(data21).append(pd.DataFrame(data22))
+
+ repr(df1 - df2)
+
+
+def test_aligning_partitions():
+ data = [0, 1, 2, 3, 4, 5]
+ modin_df1, _ = create_test_dfs({"a": data, "b": data})
+ modin_df = modin_df1.loc[:2]
+
+ modin_df2 = modin_df.append(modin_df)
+
+ modin_df2["c"] = modin_df1["b"]
+ repr(modin_df2)
diff --git a/modin/test/test_envvar_npartitions.py b/modin/test/test_envvar_npartitions.py
new file mode 100644
index 00000000000..e2d0db8570a
--- /dev/null
+++ b/modin/test/test_envvar_npartitions.py
@@ -0,0 +1,46 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import modin.pandas as pd
+import numpy as np
+import pytest
+
+from modin.config import NPartitions
+
+
+@pytest.mark.parametrize("num_partitions", [2, 4, 6, 8, 10])
+def test_set_npartitions(num_partitions):
+ NPartitions.put(num_partitions)
+ data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8))
+ df = pd.DataFrame(data)
+ part_shape = df._query_compiler._modin_frame._partitions.shape
+ assert part_shape[0] == num_partitions and part_shape[1] == min(num_partitions, 8)
+
+
+@pytest.mark.parametrize("left_num_partitions", [2, 4, 6, 8, 10])
+@pytest.mark.parametrize("right_num_partitions", [2, 4, 6, 8, 10])
+def test_runtime_change_npartitions(left_num_partitions, right_num_partitions):
+ NPartitions.put(left_num_partitions)
+ data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8))
+ left_df = pd.DataFrame(data)
+ part_shape = left_df._query_compiler._modin_frame._partitions.shape
+ assert part_shape[0] == left_num_partitions and part_shape[1] == min(
+ left_num_partitions, 8
+ )
+
+ NPartitions.put(right_num_partitions)
+ right_df = pd.DataFrame(data)
+ part_shape = right_df._query_compiler._modin_frame._partitions.shape
+ assert part_shape[0] == right_num_partitions and part_shape[1] == min(
+ right_num_partitions, 8
+ )
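
These tests change NPartitions globally; below is a small sketch (not part of the patch, and assuming NPartitions.get() mirrors NPartitions.put() like the other config parameters) of scoping such a change to a single test.

import pytest
from modin.config import NPartitions

@pytest.fixture
def scoped_npartitions():
    previous = NPartitions.get()
    yield
    NPartitions.put(previous)  # restore the original value on teardown

def test_with_two_partitions(scoped_npartitions):
    NPartitions.put(2)
    assert NPartitions.get() == 2  # later tests still see the previous value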
diff --git a/modin/test/test_partition_api.py b/modin/test/test_partition_api.py
new file mode 100644
index 00000000000..bfefb5b9b62
--- /dev/null
+++ b/modin/test/test_partition_api.py
@@ -0,0 +1,98 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import numpy as np
+import pandas
+import pytest
+
+import modin.pandas as pd
+from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions
+from modin.config import Engine, NPartitions
+from modin.pandas.test.utils import df_equals
+
+
+if Engine.get() == "Ray":
+ import ray
+if Engine.get() == "Dask":
+ from distributed.client import get_client
+
+NPartitions.put(4)
+
+
+@pytest.mark.parametrize("axis", [None, 0, 1])
+def test_unwrap_partitions(axis):
+ data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8))
+ df = pd.DataFrame(data)
+
+ if axis is None:
+ expected_partitions = df._query_compiler._modin_frame._partitions
+ actual_partitions = np.array(unwrap_partitions(df, axis=axis))
+ assert (
+ expected_partitions.shape[0] == actual_partitions.shape[0]
+ and expected_partitions.shape[1] == actual_partitions.shape[1]
+ )
+ for row_idx in range(expected_partitions.shape[0]):
+ for col_idx in range(expected_partitions.shape[1]):
+ if Engine.get() == "Ray":
+ assert (
+ expected_partitions[row_idx][col_idx].oid
+ == actual_partitions[row_idx][col_idx]
+ )
+ if Engine.get() == "Dask":
+ assert (
+ expected_partitions[row_idx][col_idx].future
+ == actual_partitions[row_idx][col_idx]
+ )
+ else:
+ expected_axis_partitions = (
+ df._query_compiler._modin_frame._frame_mgr_cls.axis_partition(
+ df._query_compiler._modin_frame._partitions, axis ^ 1
+ )
+ )
+ expected_axis_partitions = [
+ axis_partition.coalesce().unwrap(squeeze=True)
+ for axis_partition in expected_axis_partitions
+ ]
+ actual_axis_partitions = unwrap_partitions(df, axis=axis)
+ assert len(expected_axis_partitions) == len(actual_axis_partitions)
+ for item_idx in range(len(expected_axis_partitions)):
+ if Engine.get() == "Ray":
+ df_equals(
+ ray.get(expected_axis_partitions[item_idx]),
+ ray.get(actual_axis_partitions[item_idx]),
+ )
+ if Engine.get() == "Dask":
+ df_equals(
+ expected_axis_partitions[item_idx].result(),
+ actual_axis_partitions[item_idx].result(),
+ )
+
+
+@pytest.mark.parametrize("axis", [None, 0, 1])
+def test_from_partitions(axis):
+ data = np.random.randint(0, 100, size=(2 ** 16, 2 ** 8))
+ df1, df2 = pandas.DataFrame(data), pandas.DataFrame(data)
+ expected_df = pandas.concat([df1, df2], axis=1 if axis is None else axis)
+ if Engine.get() == "Ray":
+ if axis is None:
+ futures = [[ray.put(df1), ray.put(df2)]]
+ else:
+ futures = [ray.put(df1), ray.put(df2)]
+ if Engine.get() == "Dask":
+ client = get_client()
+ if axis is None:
+ futures = [client.scatter([df1, df2], hash=False)]
+ else:
+ futures = client.scatter([df1, df2], hash=False)
+ actual_df = from_partitions(futures, axis)
+ df_equals(expected_df, actual_df)
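
A minimal round-trip sketch for the partition API exercised above, assuming the Ray engine (with Dask the returned objects would be distributed futures rather than Ray object refs); the data is made up for illustration.

import numpy as np
import modin.pandas as pd
from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions

df = pd.DataFrame(np.arange(64).reshape(8, 8))
row_parts = unwrap_partitions(df, axis=0)      # one future per row partition
restored = from_partitions(row_parts, axis=0)  # concatenated back along axis 0
print(df.shape == restored.shape)              # True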
diff --git a/modin/utils.py b/modin/utils.py
index 8d2f2e96072..27be884735f 100644
--- a/modin/utils.py
+++ b/modin/utils.py
@@ -81,7 +81,7 @@ def hashable(obj):
return True
-def try_cast_to_pandas(obj):
+def try_cast_to_pandas(obj, squeeze=False):
"""
Converts obj and all nested objects from modin to pandas if it is possible,
otherwise returns obj
@@ -96,13 +96,25 @@ def try_cast_to_pandas(obj):
Converted object
"""
if hasattr(obj, "_to_pandas"):
- return obj._to_pandas()
+ result = obj._to_pandas()
+ if squeeze:
+ result = result.squeeze(axis=1)
+ return result
if hasattr(obj, "to_pandas"):
- return obj.to_pandas()
+ result = obj.to_pandas()
+ if squeeze:
+ result = result.squeeze(axis=1)
+ # Query compiler case: it has no logic for conversion to a Series
+ if (
+ isinstance(getattr(result, "name", None), str)
+ and result.name == "__reduced__"
+ ):
+ result.name = None
+ return result
if isinstance(obj, (list, tuple)):
- return type(obj)([try_cast_to_pandas(o) for o in obj])
+ return type(obj)([try_cast_to_pandas(o, squeeze=squeeze) for o in obj])
if isinstance(obj, dict):
- return {k: try_cast_to_pandas(v) for k, v in obj.items()}
+ return {k: try_cast_to_pandas(v, squeeze=squeeze) for k, v in obj.items()}
if callable(obj):
module_hierarchy = getattr(obj, "__module__", "").split(".")
fn_name = getattr(obj, "__name__", None)
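
A minimal sketch of the new `squeeze` flag on `try_cast_to_pandas` (the frame is made up for illustration): a single-column Modin DataFrame comes back as a pandas Series when `squeeze=True`, and as a DataFrame otherwise.

import modin.pandas as pd
from modin.utils import try_cast_to_pandas

md_df = pd.DataFrame({"a": [1, 2, 3]})
as_frame = try_cast_to_pandas(md_df)                 # pandas.DataFrame, as before
as_series = try_cast_to_pandas(md_df, squeeze=True)  # squeezed to pandas.Series
print(type(as_frame).__name__, type(as_series).__name__)  # DataFrame Series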
diff --git a/requirements.txt b/requirements-dev.txt
similarity index 77%
rename from requirements.txt
rename to requirements-dev.txt
index 4b7640f6b3c..6eb1163526f 100644
--- a/requirements.txt
+++ b/requirements-dev.txt
@@ -1,6 +1,6 @@
-pandas==1.1.3
-numpy
-pyarrow<0.17
+pandas==1.2.1
+numpy>=1.16.5,<1.20 # pandas gh-39513
+pyarrow>=1.0.0
dask[complete]>=2.12.0,<=2.19.0
distributed>=2.12.0,<=2.19.0
ray>=1.0.0
@@ -26,3 +26,5 @@ msgpack
pandas_gbq
cloudpickle
rpyc==4.1.5
+asv
+xgboost>=1.3
diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml
index cf615ea66de..6da59923b68 100644
--- a/requirements/env_omnisci.yml
+++ b/requirements/env_omnisci.yml
@@ -3,8 +3,9 @@ channels:
- intel/label/modin
- conda-forge
dependencies:
- - pandas==1.1.3
- - numpy
+ - pandas==1.2.1
+ - pyarrow==1.0.0
+ - numpy>=1.16.5,<1.20 # pandas gh-39513
- pip
- pytest>=6.0.1
- pytest-cov>=2.10.1
@@ -12,5 +13,7 @@ dependencies:
- coverage<5.0
- pygithub==1.53
- omniscidbe4py
- - pip:
- - ray>=1.0.0
+ - s3fs>=0.4.2
+ - ray-core >=1.0.0
+ - openpyxl
+ - xlrd
diff --git a/setup.py b/setup.py
index 15f350c4603..d270deeb671 100644
--- a/setup.py
+++ b/setup.py
@@ -37,8 +37,8 @@ def is_pure(self):
return False
-dask_deps = ["dask>=2.12.0", "distributed>=2.12.0"]
-ray_deps = ["ray>=1.0.0", "pyarrow<0.17"]
+dask_deps = ["dask>=2.12.0,<=2.19.0", "distributed>=2.12.0,<=2.19.0"]
+ray_deps = ["ray>=1.0.0", "pyarrow==1.0"]
remote_deps = ["rpyc==4.1.5", "cloudpickle==1.4.1", "boto3==1.4.8"]
all_deps = dask_deps + ray_deps + remote_deps
@@ -55,7 +55,7 @@ def is_pure(self):
url="https://github.com/modin-project/modin",
long_description=long_description,
long_description_content_type="text/markdown",
- install_requires=["pandas==1.1.3", "packaging"],
+ install_requires=["pandas==1.2.1", "packaging", "numpy>=1.16.5,<1.20"],
extras_require={
# can be installed by pip install modin[dask]
"dask": dask_deps,
@@ -63,5 +63,5 @@ def is_pure(self):
"remote": remote_deps,
"all": all_deps,
},
- python_requires=">=3.6.1",
+ python_requires=">=3.7.1",
)
diff --git a/stress_tests/kaggle/kaggle14.py b/stress_tests/kaggle/kaggle14.py
index 21251cab191..82a34c98779 100755
--- a/stress_tests/kaggle/kaggle14.py
+++ b/stress_tests/kaggle/kaggle14.py
@@ -50,7 +50,7 @@
data["Initial"] = 0
for i in data:
data["Initial"] = data.Name.str.extract(
- "([A-Za-z]+)\." # noqa: W605
+ r"([A-Za-z]+)\." # noqa: W605
) # lets extract the Salutations
pd.crosstab(data.Initial, data.Sex).T.style.background_gradient(
cmap="summer_r"
diff --git a/stress_tests/kaggle/kaggle4.py b/stress_tests/kaggle/kaggle4.py
index fa26a1c9ece..e0280d5732b 100755
--- a/stress_tests/kaggle/kaggle4.py
+++ b/stress_tests/kaggle/kaggle4.py
@@ -51,7 +51,7 @@ def ignore_warn(*args, **kwargs):
(mu, sigma) = norm.fit(train["SalePrice"])
print("\n mu = {:.2f} and sigma = {:.2f}\n".format(mu, sigma))
plt.legend(
- ["Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )".format(mu, sigma)],
+ [r"Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )".format(mu, sigma)],
loc="best", # noqa: W605
)
plt.ylabel("Frequency")
@@ -64,7 +64,7 @@ def ignore_warn(*args, **kwargs):
(mu, sigma) = norm.fit(train["SalePrice"])
print("\n mu = {:.2f} and sigma = {:.2f}\n".format(mu, sigma))
plt.legend(
- ["Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )".format(mu, sigma)],
+ [r"Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )".format(mu, sigma)],
loc="best", # noqa: W605
)
plt.ylabel("Frequency")
diff --git a/stress_tests/kaggle/kaggle5.py b/stress_tests/kaggle/kaggle5.py
index 5feeb6585b1..67dee1c27c8 100755
--- a/stress_tests/kaggle/kaggle5.py
+++ b/stress_tests/kaggle/kaggle5.py
@@ -53,7 +53,7 @@
"After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape
for dataset in combine:
dataset["Title"] = dataset.Name.str.extract(
- " ([A-Za-z]+)\.", expand=False
+ r" ([A-Za-z]+)\.", expand=False
) # noqa: W605
pd.crosstab(train_df["Title"], train_df["Sex"])
for dataset in combine: