Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.
This repository was archived by the owner on May 9, 2024. It is now read-only.

Assertion failed on taxi and plasticc benchmarks #458

@gshimansky

Description

@gshimansky

When running the taxi benchmark (modified to work with CSV input that has a header row) and the plasticc benchmark on an HDK debug build, I encounter a failed assertion: `QueryFragmentDescriptor.cpp:297 Check failed: table_info`. This happens in the same place on both Linux and Windows, so it is not a Windows-specific problem.

Code to reproduce:

import sys
import time
import json
from collections import OrderedDict
import modin.pandas as pd


def read(filename):
    """Load the taxi CSV at *filename* into a DataFrame.

    The first row of the file is used as the header. Every column whose
    declared type is not "timestamp" is passed to ``read_csv`` through
    ``dtype``; the timestamp columns are passed through ``parse_dates``.
    """
    # Short aliases keep the schema table below compact.
    i64, f64, text, ts = "int64", "float64", "string", "timestamp"

    schema = {
        "trip_id": i64,
        "vendor_id": text,
        "pickup_datetime": ts,
        "dropoff_datetime": ts,
        "store_and_fwd_flag": text,
        "rate_code_id": i64,
        "pickup_longitude": f64,
        "pickup_latitude": f64,
        "dropoff_longitude": f64,
        "dropoff_latitude": f64,
        "passenger_count": i64,
        "trip_distance": f64,
        "fare_amount": f64,
        "extra": f64,
        "mta_tax": f64,
        "tip_amount": f64,
        "tolls_amount": f64,
        "ehail_fee": f64,
        "improvement_surcharge": f64,
        "total_amount": f64,
        "payment_type": text,
        "trip_type": f64,
        "pickup": text,
        "dropoff": text,
        "cab_type": text,
        "precipitation": f64,
        "snow_depth": i64,
        "snowfall": f64,
        "max_temperature": i64,
        "min_temperature": i64,
        "average_wind_speed": f64,
        "pickup_nyct2010_gid": f64,
        "pickup_ctlabel": f64,
        "pickup_borocode": f64,
        "pickup_boroname": text,
        "pickup_ct2010": f64,
        "pickup_boroct2010": f64,
        "pickup_cdeligibil": text,
        "pickup_ntacode": text,
        "pickup_ntaname": text,
        "pickup_puma": f64,
        "dropoff_nyct2010_gid": f64,
        "dropoff_ctlabel": f64,
        "dropoff_borocode": f64,
        "dropoff_boroname": text,
        "dropoff_ct2010": f64,
        "dropoff_boroct2010": f64,
        "dropoff_cdeligibil": text,
        "dropoff_ntacode": text,
        "dropoff_ntaname": text,
        "dropoff_puma": f64,
    }

    # Split the schema in a single pass: timestamp columns are handed to the
    # date parser, everything else gets an explicit dtype.
    plain_dtypes = {}
    timestamp_columns = []
    for column, kind in schema.items():
        if kind == ts:
            timestamp_columns.append(column)
        else:
            plain_dtypes[column] = kind

    frame = pd.read_csv(
        filename,
        header=0,
        dtype=plain_dtypes,
        parse_dates=timestamp_columns,
    )

    frame.shape  # to trigger real execution on omnisci
    return frame


def q1_omnisci(df):
    """Return the number of rows for each distinct ``cab_type`` in *df*."""
    rows_per_cab_type = df.groupby("cab_type")
    counts = rows_per_cab_type.size()
    # Touch .shape so a lazily evaluated backend runs the query now
    # (the original note refers to omnisci).
    counts.shape
    return counts


def q2_omnisci(df):
    """Return the mean ``total_amount`` for each ``passenger_count`` in *df*."""
    aggregation = {"total_amount": "mean"}
    by_passengers = df.groupby("passenger_count")
    mean_totals = by_passengers.agg(aggregation)
    # Touch .shape so a lazily evaluated backend runs the query now
    # (the original note refers to omnisci).
    mean_totals.shape
    return mean_totals


def q3_omnisci(df):
    """Count rows per (``passenger_count``, pickup year) pair.

    Note: the ``pickup_datetime`` column of *df* is overwritten in place with
    its year component before grouping.
    """
    pickup_years = df["pickup_datetime"].dt.year
    df["pickup_datetime"] = pickup_years

    group_keys = ["passenger_count", "pickup_datetime"]
    counts = df.groupby(group_keys).size()
    # Touch .shape so a lazily evaluated backend runs the query now
    # (the original note refers to omnisci).
    counts.shape
    return counts


def q4_omnisci(df):
    """Count rows per (passenger count, pickup year, integer trip distance).

    The result is a flat frame (group keys as columns, the count in column
    ``0``) ordered by pickup year ascending and count descending, with a
    fresh 0..n-1 index.

    Note: *df* is modified in place: ``pickup_datetime`` is replaced by its
    year and ``trip_distance`` is cast to ``int64``.
    """
    pickup_years = df["pickup_datetime"].dt.year
    df["pickup_datetime"] = pickup_years
    whole_distances = df["trip_distance"].astype("int64")
    df["trip_distance"] = whole_distances

    group_keys = ["passenger_count", "pickup_datetime", "trip_distance"]
    group_sizes = df.groupby(group_keys, sort=False).size()
    # reset_index() turns the keys into columns; the unnamed size column is 0.
    flat_counts = group_sizes.reset_index()
    ordered = flat_counts.sort_values(
        by=["pickup_datetime", 0],
        ignore_index=True,
        ascending=[True, False],
    )
    # Touch .shape so a lazily evaluated backend runs the query now
    # (the original note refers to omnisci).
    ordered.shape
    return ordered


def measure(func, *args, **kw):
    """Call ``func(*args, **kw)`` and time it.

    Returns:
        A ``(result, elapsed)`` tuple where *result* is whatever *func*
        returned and *elapsed* is the call duration in seconds (float).
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump (even backwards) when the system clock changes.
    started = time.perf_counter()
    res = func(*args, **kw)
    elapsed = time.perf_counter() - started
    return res, elapsed


def run(input_file):
    """Run the read step and the four queries, collecting their timings.

    Returns:
        An ``OrderedDict`` mapping "Reading", "Q1", "Q2", "Q3" and "Q4" (in
        that order) to the elapsed time reported by ``measure``.
    """
    timings = OrderedDict()
    frame, timings["Reading"] = measure(read, input_file)

    # (label, query, needs_copy): Q3 and Q4 overwrite columns of their input,
    # so each of them gets its own copy of the loaded frame.
    steps = (
        ("Q1", q1_omnisci, False),
        ("Q2", q2_omnisci, False),
        ("Q3", q3_omnisci, True),
        ("Q4", q4_omnisci, True),
    )
    for label, query, needs_copy in steps:
        # The copy is made before measure() starts, so it is not timed.
        query_input = frame.copy() if needs_copy else frame
        _, timings[label] = measure(query, query_input)
    return timings


def main():
    """Script entry point: benchmark the data file named on the command line.

    Expects exactly one argument (the data file name). With any other
    argument count it prints a usage line and returns without running
    anything. Otherwise the timings returned by ``run`` are written to
    stdout as indented JSON.
    """
    if len(sys.argv) != 2:
        # Plain string: the message has no placeholders, so no f-prefix.
        print("USAGE: python taxi.py <data file name>")
        return
    result = run(sys.argv[1])
    json.dump(result, sys.stdout, indent=4)


# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Data file that contains a header row and just one data row:

trip_id,pickup_datetime,dropoff_datetime,rate_code_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_type,cab_type,precipitation,snow_depth,snowfall,max_temperature,min_temperature,average_wind_speed,pickup_nyct2010_gid,pickup_ctlabel,pickup_borocode,pickup_ct2010,pickup_boroct2010,pickup_puma,dropoff_nyct2010_gid,dropoff_ctlabel,dropoff_borocode,dropoff_ct2010,dropoff_boroct2010,dropoff_puma
728589253,2013-03-23 19:00:03,2015-02-09 23:53:19,227,-3384.6504885319628,-1768.4547705633008,899.3313637135943,-3567.838610520537,0,247.59167806792436,283313.3109043807,11958.290443405911,37.04835519254301,285575.9549971816,5595.173567982231,34.00429082233286,611256.349785416,1.2010861125030665,green,2.737725308004807,8,26.25521086711126,27,15,7.959928315627986,1540.786264683745,1.926857212032628,3.528783788433975,73450.27117442139,4394007.527430953,4035.594680159191,429.3012304384143,5400.04327998597,2.3620759492837418,146869.91997436076,1023130.9389877942,3812.462431689403

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions