Skip to content

Commit a9caab7

Browse files
authored
feat: new-york-city-taxi-fare-prediction_template (#488)
* copy init version * feat: new-york-city-taxi-fare-prediction_template * add move to linear model * Add more details about docker * auto lint * auto lint with new black
1 parent f6c522b commit a9caab7

15 files changed

Lines changed: 339 additions & 4 deletions

File tree

rdagent/components/proposal/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020

2121
class LLMHypothesisGen(HypothesisGen):
22-
2322
def __init__(self, scen: Scenario):
2423
super().__init__(scen)
2524

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import os
2+
3+
import numpy as np
4+
import pandas as pd
5+
from sklearn.impute import SimpleImputer
6+
from sklearn.model_selection import train_test_split
7+
8+
index_name = "key"
9+
label_name = "fare_amount"
10+
11+
12+
def prepreprocess(data_path: str = "/kaggle/input/train.csv", test_size: float = 0.20):
    """
    Load the raw training data, drop the index column, and split it into
    train and validation sets.

    Parameters
    ----------
    data_path : str
        Path to the raw training CSV. Defaults to the Kaggle input location,
        so existing callers are unaffected.
    test_size : float
        Fraction of rows held out for validation (default 0.20, as before).

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid)
    """
    # Load the raw data and drop the row identifier ("key"), which carries no signal.
    data_df = pd.read_csv(data_path)
    data_df = data_df.drop([index_name], axis=1)

    X = data_df.drop([label_name], axis=1)
    y = data_df[label_name]

    # Fixed random_state keeps the split reproducible across runs.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=42)

    return X_train, X_valid, y_train, y_valid
27+
28+
29+
def preprocess_script():
    """
    Produce the train/validation/test data for the whole pipeline.

    If pre-pickled splits exist under /kaggle/input they are loaded as-is
    (fast path); otherwise the raw CSVs are read and split from scratch.

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid, X_test, ids_or_others...)
    """
    if os.path.exists("/kaggle/input/X_train.pkl"):
        # Fast path: a previous run already pickled every split.
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")
        return X_train, X_valid, y_train, y_valid, X_test, *others

    # Slow path: build the train/validation split from the raw CSV.
    X_train, X_valid, y_train, y_valid = prepreprocess()

    # Load the test set, keeping its "key" column for the submission file.
    submission_df = pd.read_csv("/kaggle/input/test.csv")
    ids = submission_df[index_name]
    X_test = submission_df.drop([index_name], axis=1)

    return X_train, X_valid, y_train, y_valid, X_test, ids
51+
52+
53+
def clean_and_impute_data(X_train, X_valid, X_test):
    """
    Replace inf/-inf with NaN, mean-impute missing values, and drop
    duplicate columns.

    The imputer is fit on the training split only and then applied to the
    validation and test splits, so no statistics leak from valid/test.

    Fixes over the original version:
    - no longer mutates the callers' DataFrames in place (the `inplace=True`
      replaces were an unexpected side effect);
    - preserves each frame's original row index instead of silently
      resetting it to a RangeIndex when re-wrapping the imputer output.

    Returns
    -------
    tuple
        (X_train, X_valid, X_test) cleaned DataFrames.
    """
    # Work on new frames so callers' data is left untouched.
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_valid = X_valid.replace([np.inf, -np.inf], np.nan)
    X_test = X_test.replace([np.inf, -np.inf], np.nan)

    # Mean imputation: fit on train, reuse the same statistics on valid/test.
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

    # Drop duplicated column labels, keeping the first occurrence.
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
    X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
    X_test = X_test.loc[:, ~X_test.columns.duplicated()]

    return X_train, X_valid, X_test
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import pandas as pd
2+
3+
"""
4+
Here is the feature engineering code for each task, with a class that has a fit and transform method.
5+
Remember
6+
"""
7+
8+
9+
class DatetimeFeature:
    """Expand the `pickup_datetime` column into calendar features.

    Implements the template's fit/transform feature-engineering interface.
    """

    def fit(self, train_df: pd.DataFrame):
        """No-op: this transformer is stateless, so there is nothing to learn."""
        pass

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Derive hour/day/month/weekday/year columns from `pickup_datetime`
        and drop the original timestamp column.

        Fix over the original version: works on a copy, so the caller's
        DataFrame is no longer mutated in place (the original dropped
        `pickup_datetime` from the caller's frame as a side effect).
        """
        X = X.copy()
        # Timestamps arrive as strings like "2013-07-02 19:54:00 UTC".
        dt = pd.to_datetime(X["pickup_datetime"], format="%Y-%m-%d %H:%M:%S UTC")
        X["hour"] = dt.dt.hour
        X["day"] = dt.dt.day
        X["month"] = dt.dt.month
        X["weekday"] = dt.dt.weekday
        X["year"] = dt.dt.year
        return X.drop(columns=["pickup_datetime"])


feature_engineering_cls = DatetimeFeature
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""
2+
Motivation of the model:
3+
The Linear Regression model is chosen for its simplicity and interpretability. It is a good starting point for regression tasks
4+
and provides a baseline to compare more complex models against. Linear Regression assumes a linear relationship between the
5+
features and the target variable, which can be a reasonable assumption for many problems.
6+
"""
7+
8+
import pandas as pd
9+
from sklearn.linear_model import LinearRegression
10+
from sklearn.metrics import mean_squared_error
11+
12+
13+
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Train a Linear Regression model and report its validation MSE.

    Returns
    -------
    LinearRegression
        The fitted model.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Report hold-out performance so different runs can be compared in the logs.
    valid_pred = regressor.predict(X_valid)
    mse = mean_squared_error(y_valid, valid_pred)
    print(f"Validation Mean Squared Error: {mse:.4f}")

    return regressor
29+
30+
31+
def predict(model, X):
    """
    Run the trained model on X and return predictions as a column vector.

    Keeps feature selection's consistency with the fit step; downstream code
    expects a 2-D (n_samples, 1) array.
    """
    predictions = model.predict(X)
    return predictions.reshape(-1, 1)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features for the fit/predict functions.

    Every feature is currently kept; multi-level column labels are flattened
    into single underscore-joined strings so downstream models accept them.
    """
    if X.columns.nlevels > 1:
        # Flatten MultiIndex labels, e.g. ("feature_0", "hour") -> "feature_0_hour".
        X.columns = ["_".join(map(str, col)).strip() for col in X.columns.values]
    return X
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features for the fit/predict functions.

    Every feature is currently kept; multi-level column labels are flattened
    into single underscore-joined strings so downstream models accept them.
    """
    if X.columns.nlevels > 1:
        # Flatten MultiIndex labels, e.g. ("feature_0", "hour") -> "feature_0_hour".
        X.columns = ["_".join(map(str, col)).strip() for col in X.columns.values]
    return X
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features for the fit/predict functions.

    Every feature is currently kept; multi-level column labels are flattened
    into single underscore-joined strings so downstream models accept them.
    """
    if X.columns.nlevels > 1:
        # Flatten MultiIndex labels, e.g. ("feature_0", "hour") -> "feature_0_hour".
        X.columns = ["_".join(map(str, col)).strip() for col in X.columns.values]
    return X
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features for the fit/predict functions.

    Every feature is currently kept; multi-level column labels are flattened
    into single underscore-joined strings so downstream models accept them.
    """
    if X.columns.nlevels > 1:
        # Flatten MultiIndex labels, e.g. ("feature_0", "hour") -> "feature_0_hour".
        X.columns = ["_".join(map(str, col)).strip() for col in X.columns.values]
    return X
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
3+
4+
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features for the fit/predict functions.

    Every feature is currently kept; multi-level column labels are flattened
    into single underscore-joined strings so downstream models accept them.
    """
    if X.columns.nlevels > 1:
        # Flatten MultiIndex labels, e.g. ("feature_0", "hour") -> "feature_0_hour".
        X.columns = ["_".join(map(str, col)).strip() for col in X.columns.values]
    return X
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import importlib.util
2+
import random
3+
from pathlib import Path
4+
5+
import numpy as np
6+
import pandas as pd
7+
from fea_share_preprocess import clean_and_impute_data, preprocess_script
8+
from sklearn.metrics import matthews_corrcoef, root_mean_squared_error
9+
10+
# Seed every RNG this script uses so runs are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Directory containing this script; used below to locate feature/ and model/ modules.
DIRNAME = Path(__file__).absolute().resolve().parent
15+
16+
17+
def compute_metrics_for_classification(y_true, y_pred):
    """Compute the Matthews correlation coefficient (MCC) for classification."""
    return matthews_corrcoef(y_true, y_pred)
21+
22+
23+
def import_module_from_path(module_name, module_path):
    """
    Dynamically load a Python module from an arbitrary file path.

    Used to pick up the generated feature/model/select modules at runtime.
    Returns the executed module object.
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded
28+
29+
30+
# 1) Preprocess the data: build train/valid splits and load the test set.
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering: apply every module under feature/feat*.py.
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep a feature set only if it produced the same column count on all splits.
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

# Concatenate the surviving feature sets side by side, one column level per source.
X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

print(X_train.shape, X_valid.shape, X_test.shape)

# 3) Handle inf/-inf values and impute missing data (imputer is fit on train only).
X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)


# Train one model per model/model*.py, pairing it with its select*.py helper.
model_l = []  # list[tuple[model, predict_func, select_module]]
for f in DIRNAME.glob("model/model*.py"):
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))

# 4) Evaluate every trained model on the validation set.
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    y_valid_pred = predict_func(model, X_valid_selected)
    rmse = root_mean_squared_error(y_valid, y_valid_pred)
    print(f"final root mean squared error on valid set: {rmse}")
    metrics_all.append(rmse)

# 5) Save the best (lowest) validation RMSE.
min_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["root mean squared error"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set with the best model.
# BUG FIX: the original added `+ 1` to the flattened predictions, which
# systematically inflated every predicted fare by 1 — a leftover from a
# classification-label-offset template, wrong for this regression task.
X_test_selected = model_l[min_index][2].select(X_test.copy())
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected).flatten()


# 7) Submit predictions for the test set.
submission_result = pd.DataFrame(y_test_pred, columns=["fare_amount"])
submission_result.insert(0, "key", ids)

submission_result.to_csv("submission.csv", index=False)

0 commit comments

Comments
 (0)