This repository was archived by the owner on May 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 14
This repository was archived by the owner on May 9, 2024. It is now read-only.
Assertion failed on taxi and plasticc benchmarks #458
Copy link
Copy link
Closed
Description
When running the taxi benchmark (modified to work with CSV input that has a header row) and the plasticc benchmark on an HDK debug build, I encounter a failed assertion: QueryFragmentDescriptor.cpp:297 Check failed: table_info.
This happens in the same place on both Linux and Windows, so it is not a Windows-specific problem.
Code to reproduce:
import sys
import time
import json
from collections import OrderedDict
import modin.pandas as pd
def read(filename):
    """Load the taxi CSV at ``filename`` into a dataframe with explicit types.

    Columns declared as ``"timestamp"`` are handed to ``read_csv`` through
    ``parse_dates``; every other declared column is passed through ``dtype``.
    The first row of the file is treated as the header.
    """
    column_types = {
        "trip_id": "int64",
        "vendor_id": "string",
        "pickup_datetime": "timestamp",
        "dropoff_datetime": "timestamp",
        "store_and_fwd_flag": "string",
        "rate_code_id": "int64",
        "pickup_longitude": "float64",
        "pickup_latitude": "float64",
        "dropoff_longitude": "float64",
        "dropoff_latitude": "float64",
        "passenger_count": "int64",
        "trip_distance": "float64",
        "fare_amount": "float64",
        "extra": "float64",
        "mta_tax": "float64",
        "tip_amount": "float64",
        "tolls_amount": "float64",
        "ehail_fee": "float64",
        "improvement_surcharge": "float64",
        "total_amount": "float64",
        "payment_type": "string",
        "trip_type": "float64",
        "pickup": "string",
        "dropoff": "string",
        "cab_type": "string",
        "precipitation": "float64",
        "snow_depth": "int64",
        "snowfall": "float64",
        "max_temperature": "int64",
        "min_temperature": "int64",
        "average_wind_speed": "float64",
        "pickup_nyct2010_gid": "float64",
        "pickup_ctlabel": "float64",
        "pickup_borocode": "float64",
        "pickup_boroname": "string",
        "pickup_ct2010": "float64",
        "pickup_boroct2010": "float64",
        "pickup_cdeligibil": "string",
        "pickup_ntacode": "string",
        "pickup_ntaname": "string",
        "pickup_puma": "float64",
        "dropoff_nyct2010_gid": "float64",
        "dropoff_ctlabel": "float64",
        "dropoff_borocode": "float64",
        "dropoff_boroname": "string",
        "dropoff_ct2010": "float64",
        "dropoff_boroct2010": "float64",
        "dropoff_cdeligibil": "string",
        "dropoff_ntacode": "string",
        "dropoff_ntaname": "string",
        "dropoff_puma": "float64",
    }

    # Split the schema in one pass: timestamp columns are parsed as dates,
    # everything else keeps its declared dtype.
    timestamp_columns = []
    typed_columns = {}
    for name, kind in column_types.items():
        if kind == "timestamp":
            timestamp_columns.append(name)
        else:
            typed_columns[name] = kind

    frame = pd.read_csv(
        filename,
        header=0,
        dtype=typed_columns,
        parse_dates=timestamp_columns,
    )
    frame.shape  # to trigger real execution on omnisci
    return frame
def q1_omnisci(df):
    """Taxi query 1: number of rows for each ``cab_type`` value."""
    by_cab_type = df.groupby("cab_type")
    counts = by_cab_type.size()
    counts.shape  # to trigger real execution on omnisci
    return counts
def q2_omnisci(df):
    """Taxi query 2: mean ``total_amount`` for each ``passenger_count``."""
    aggregation = {"total_amount": "mean"}
    averages = df.groupby("passenger_count").agg(aggregation)
    averages.shape  # to trigger real execution on omnisci
    return averages
def q3_omnisci(df):
    """Taxi query 3: row counts per (``passenger_count``, pickup year).

    Overwrites ``df["pickup_datetime"]`` in place with the year component,
    so the caller's dataframe is modified.
    """
    pickup_year = df["pickup_datetime"].dt.year
    df["pickup_datetime"] = pickup_year
    group_keys = ["passenger_count", "pickup_datetime"]
    counts = df.groupby(group_keys).size()
    counts.shape  # to trigger real execution on omnisci
    return counts
def q4_omnisci(df):
    """Taxi query 4: counts per (passengers, pickup year, whole-number distance).

    Overwrites ``df["pickup_datetime"]`` with its year and casts
    ``df["trip_distance"]`` to ``int64`` in place, then returns the group
    sizes as a flat frame ordered by year ascending and count descending.
    """
    df["pickup_datetime"] = df["pickup_datetime"].dt.year
    df["trip_distance"] = df["trip_distance"].astype("int64")

    group_keys = ["passenger_count", "pickup_datetime", "trip_distance"]
    sizes = df.groupby(group_keys, sort=False).size()
    # After reset_index the unnamed size column is labelled 0.
    flat = sizes.reset_index()
    ordered = flat.sort_values(
        by=["pickup_datetime", 0],
        ascending=[True, False],
        ignore_index=True,
    )
    ordered.shape  # to trigger real execution on omnisci
    return ordered
def measure(func, *args, **kw):
    """Call ``func(*args, **kw)`` and report how long the call took.

    Returns:
        A ``(result, elapsed)`` tuple, where ``result`` is whatever ``func``
        returned and ``elapsed`` is the duration of the call in seconds.

    Any exception raised by ``func`` propagates unchanged.
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock, which can be adjusted mid-run and skew short measurements.
    started = time.perf_counter()
    result = func(*args, **kw)
    elapsed = time.perf_counter() - started
    return result, elapsed
def run(input_file):
    """Time loading ``input_file`` and each of the four taxi queries.

    Returns an ``OrderedDict`` whose keys, in order, are ``"Reading"``,
    ``"Q1"``, ``"Q2"``, ``"Q3"`` and ``"Q4"``, each mapped to the elapsed
    time reported by ``measure`` for that step.
    """
    timings = OrderedDict()
    frame, timings["Reading"] = measure(read, input_file)

    # Q3 and Q4 overwrite columns of their argument, so each gets its own
    # copy; the copy is made before timing starts.
    steps = (
        ("Q1", q1_omnisci, False),
        ("Q2", q2_omnisci, False),
        ("Q3", q3_omnisci, True),
        ("Q4", q4_omnisci, True),
    )
    for label, query, needs_copy in steps:
        argument = frame.copy() if needs_copy else frame
        timings[label] = measure(query, argument)[1]
    return timings
def main():
    """Command-line entry point: benchmark the data file named in ``sys.argv``.

    Expects exactly one command-line argument (the data file name). With any
    other argument count it prints a usage line and returns without running
    anything. Otherwise it writes the mapping returned by ``run`` to stdout
    as JSON indented by four spaces.
    """
    if len(sys.argv) != 2:
        # Plain literal: the message has no placeholders, so the f-prefix
        # the original carried was a no-op.
        print("USAGE: python taxi.py <data file name>")
        return
    result = run(sys.argv[1])
    json.dump(result, sys.stdout, indent=4)
# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Data file that contains a header row and just one data row:
trip_id,pickup_datetime,dropoff_datetime,rate_code_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_type,cab_type,precipitation,snow_depth,snowfall,max_temperature,min_temperature,average_wind_speed,pickup_nyct2010_gid,pickup_ctlabel,pickup_borocode,pickup_ct2010,pickup_boroct2010,pickup_puma,dropoff_nyct2010_gid,dropoff_ctlabel,dropoff_borocode,dropoff_ct2010,dropoff_boroct2010,dropoff_puma
728589253,2013-03-23 19:00:03,2015-02-09 23:53:19,227,-3384.6504885319628,-1768.4547705633008,899.3313637135943,-3567.838610520537,0,247.59167806792436,283313.3109043807,11958.290443405911,37.04835519254301,285575.9549971816,5595.173567982231,34.00429082233286,611256.349785416,1.2010861125030665,green,2.737725308004807,8,26.25521086711126,27,15,7.959928315627986,1540.786264683745,1.926857212032628,3.528783788433975,73450.27117442139,4394007.527430953,4035.594680159191,429.3012304384143,5400.04327998597,2.3620759492837418,146869.91997436076,1023130.9389877942,3812.462431689403
Metadata
Metadata
Assignees
Labels
No labels